Upload app.py
app.py CHANGED
@@ -1,504 +1,412 @@
-import gradio as gr
-import os
-import cv2
-import numpy as np
-import torch
-import torch.nn.functional as F
-from PIL import Image
-import tempfile
-import io
[old lines 10-39 did not survive extraction]
-model
[old lines 41-48 did not survive extraction]
-if
[old lines 50-102 did not survive extraction]
-depth
[old lines 104-118 did not survive extraction]
-depth_colored = (depth * 255).astype(np.uint8)
-depth_colored =
[old lines 121-281 did not survive extraction]
-with gr.Blocks(title="Depth Anything AC - Depth Estimation Demo", theme=gr.themes.Soft()
[old lines 283-374 did not survive extraction]
-label="
[old lines 376-411 did not survive extraction]
-            return result, None, gr.update(visible=True), download_update
-        else:  # Use Camera
-            result, download_update = predict_depth(camera_img, colormap)
-            return result, None, gr.update(visible=True), download_update
-
-    # Separate image and video examples
-    image_examples = []
-    video_examples = []
-    if os.path.exists("toyset"):
-        for img_file in ["1.png", "2.png", "good.png"]:
-            if os.path.exists(f"toyset/{img_file}"):
-                image_examples.append([f"toyset/{img_file}", "Spectral"])
-
-        for vid_file in ["fog_2_processed_1s-6s_1.0x.mp4", "snow_processed_1s-6s_1.0x.mp4"]:
-            if os.path.exists(f"toyset/{vid_file}"):
-                video_examples.append([f"toyset/{vid_file}", "Spectral"])
-
-    # Function to handle video example selection and auto-switch mode
-    def handle_video_example(video_path, colormap):
-        # Auto-switch to video mode and return the necessary updates
-        return (
-            "Upload Video",  # input_source
-            gr.update(visible=False),  # upload_image
-            gr.update(visible=True, value=video_path),  # upload_file
-            gr.update(visible=False)  # camera_image
-        )
-
-    # Function to handle image example selection and auto-switch mode
-    def handle_image_example(image, colormap):
-        # Auto-switch to image mode and process the image
-        result = predict_depth(image, colormap)
-        output_image = result[0] if result[0] is not None else None
-        return (
-            "Upload Image",  # input_source
-            gr.update(visible=True, value=image),  # upload_image
-            gr.update(visible=False),  # upload_file
-            gr.update(visible=False),  # camera_image
-            output_image  # output_image
-        )
-
-    if image_examples:
-        gr.Examples(
-            examples=image_examples,
-            inputs=[upload_image, colormap_choice],
-            outputs=[input_source, upload_image, upload_file, camera_image, output_image],
-            fn=handle_image_example,
-            cache_examples=False,
-            label="Try these example images"
-        )
-
-    if video_examples:
-        gr.Examples(
-            examples=video_examples,
-            inputs=[upload_file, colormap_choice],
-            outputs=[input_source, upload_image, upload_file, camera_image],
-            fn=handle_video_example,
-            cache_examples=False,
-            label="Try these example videos"
-        )
-
-    submit_btn.click(
-        fn=handle_prediction,
-        inputs=[input_source, upload_image, upload_file, camera_image, colormap_choice],
-        outputs=[output_image, output_file, output_image, download_btn],
-        show_progress=True
-    )
-
-    gr.Markdown("""
-    ## 📝 Colormap Description
-    - **Spectral**: Rainbow spectrum, with clear contrast between near and far
-    - **Inferno**: Fire spectrum, warm tones
-    - **Gray**: Classic grayscale depth representation
-
-    ## 📷 Camera Usage Tips
-    - Ensure camera access is allowed when prompted
-    - Click the camera button to capture the current frame
-    - The captured image will be used as input for depth estimation
-
-    ## 🎬 Video Processing Tips
-    - Supports multiple video formats (MP4, AVI, MOV, etc.)
-    - Video processing may take some time, please be patient
-    - Processing progress will be displayed in real-time
-    - The output video will maintain the same frame rate as the input
-    """)
-
-
-if __name__ == "__main__":
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=False,
-        show_error=True
-    )
+import gradio as gr
+import os
+import cv2
+import numpy as np
+import torch
+import torch.nn.functional as F
+from PIL import Image
+import tempfile
+import io
+from tqdm import tqdm
+
+from depth_anything.dpt import DepthAnything_AC
+
+
+def normalize_depth(disparity_tensor):
+    """Standard normalization method to convert disparity to depth"""
+    eps = 1e-6
+    disparity_min = disparity_tensor.min()
+    disparity_max = disparity_tensor.max()
+    normalized_disparity = (disparity_tensor - disparity_min) / (disparity_max - disparity_min + eps)
+    return normalized_disparity
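A quick numeric check of the min-max rescaling above (an illustrative sketch, not part of the diff):

    t = torch.tensor([2.0, 4.0, 6.0])
    normalize_depth(t)  # tensor([0.0000, 0.5000, 1.0000]), up to the 1e-6 eps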
+
+
+def load_model(model_path='checkpoints/depth_anything_AC_vits.pth', encoder='vits'):
+    """Load trained depth estimation model"""
+    model_configs = {
+        'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024], 'version': 'v2'},
+        'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768], 'version': 'v2'},
+        'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384], 'version': 'v2'}
+    }
+
+    model = DepthAnything_AC(model_configs[encoder])
+
+    if os.path.exists(model_path):
+        checkpoint = torch.load(model_path, map_location='cpu')
+        model.load_state_dict(checkpoint, strict=False)
+    else:
+        print(f"Warning: Model file {model_path} not found")
+
+    model.eval()
+    if torch.cuda.is_available():
+        model.cuda()
+
+    return model
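The larger backbones load the same way; a hedged sketch (the ViT-B checkpoint filename is an assumption following the 'vits' naming pattern above):

    model_b = load_model(model_path='checkpoints/depth_anything_AC_vitb.pth', encoder='vitb')  # filename assumed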
+
+
+def preprocess_image(image, target_size=518):
+    """Preprocess input image"""
+    if isinstance(image, Image.Image):
+        image = np.array(image)
+
+    if len(image.shape) == 3 and image.shape[2] == 3:
+        pass
+    elif len(image.shape) == 3 and image.shape[2] == 4:
+        image = image[:, :, :3]
+
+    image = image.astype(np.float32) / 255.0
+    h, w = image.shape[:2]
+    scale = target_size / min(h, w)
+    new_h, new_w = int(h * scale), int(w * scale)
+
+    new_h = ((new_h + 13) // 14) * 14
+    new_w = ((new_w + 13) // 14) * 14
+    image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
+
+    mean = np.array([0.485, 0.456, 0.406])
+    std = np.array([0.229, 0.224, 0.225])
+    image = (image - mean) / std
+
+    image = torch.from_numpy(image.transpose(2, 0, 1)).float()
+    image = image.unsqueeze(0)
+
+    return image, (h, w)
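For concreteness, the rounding above pads each side up to a multiple of 14 (the ViT patch size), so a 480x640 input at target_size=518 comes out as 518x700; an illustrative check, not part of the diff:

    tensor, size = preprocess_image(Image.new('RGB', (640, 480)))
    # scale = 518/480, giving (518, 690) before rounding and (518, 700) after
    assert tensor.shape == (1, 3, 518, 700) and size == (480, 640)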
+
+
+def preprocess_image_from_array(image_array, target_size=518):
+    """Preprocess input image from numpy array (for video frames)"""
+    if len(image_array.shape) == 3 and image_array.shape[2] == 3:
+        # Convert BGR to RGB if needed
+        image = cv2.cvtColor(image_array, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
+    else:
+        image = image_array.astype(np.float32) / 255.0
+
+    h, w = image.shape[:2]
+    scale = target_size / min(h, w)
+    new_h, new_w = int(h * scale), int(w * scale)
+
+    new_h = ((new_h + 13) // 14) * 14
+    new_w = ((new_w + 13) // 14) * 14
+    image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
+
+    mean = np.array([0.485, 0.456, 0.406])
+    std = np.array([0.229, 0.224, 0.225])
+    image = (image - mean) / std
+
+    image = torch.from_numpy(image.transpose(2, 0, 1)).float()
+    image = image.unsqueeze(0)
+
+    return image, (h, w)
+
+
+def postprocess_depth(depth_tensor, original_size):
+    """Post-process depth map"""
+    if depth_tensor.dim() == 3:
+        depth_tensor = depth_tensor.unsqueeze(1)
+    elif depth_tensor.dim() == 2:
+        depth_tensor = depth_tensor.unsqueeze(0).unsqueeze(1)
+
+    h, w = original_size
+    depth = F.interpolate(depth_tensor, size=(h, w), mode='bilinear', align_corners=True)
+    depth = depth.squeeze().cpu().numpy()
+
+    return depth
+
+
+def create_colored_depth_map(depth, colormap='spectral'):
+    """Create colored depth map"""
+    if colormap == 'inferno':
+        depth_colored = cv2.applyColorMap((depth * 255).astype(np.uint8), cv2.COLORMAP_INFERNO)
+        depth_colored = cv2.cvtColor(depth_colored, cv2.COLOR_BGR2RGB)
+    elif colormap == 'spectral':
+        from matplotlib import cm
+        spectral_cmap = cm.get_cmap('Spectral_r')
+        depth_colored = (spectral_cmap(depth) * 255).astype(np.uint8)
+        depth_colored = depth_colored[:, :, :3]
+    else:
+        depth_colored = (depth * 255).astype(np.uint8)
+        depth_colored = np.stack([depth_colored] * 3, axis=2)
+
+    return depth_colored
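One portability note: matplotlib deprecated cm.get_cmap in 3.7, so on newer installs the equivalent lookup (a sketch with identical behavior) is:

    from matplotlib import colormaps
    spectral_cmap = colormaps['Spectral_r']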
+
+
+def is_video_file(filepath):
+    """Check if the given file is a video file based on its extension"""
+    video_extensions = ['.mp4', '.avi', '.mov', '.mkv', '.flv', '.wmv', '.webm', '.m4v']
+    _, ext = os.path.splitext(filepath.lower())
+    return ext in video_extensions
+
+
+print("Loading model...")
+model = load_model()
+print("Model loaded successfully!")
+
+
+def predict_depth(input_image, colormap_choice):
+    """Main depth prediction function for images"""
+    try:
+        image_tensor, original_size = preprocess_image(input_image)
+
+        if torch.cuda.is_available():
+            image_tensor = image_tensor.cuda()
+
+        with torch.no_grad():
+            prediction = model(image_tensor)
+            disparity_tensor = prediction['out']
+            depth_tensor = normalize_depth(disparity_tensor)
+
+        depth = postprocess_depth(depth_tensor, original_size)
+
+        depth_colored = create_colored_depth_map(depth, colormap_choice.lower())
+
+        return Image.fromarray(depth_colored)
+
+    except Exception as e:
+        print(f"Error during image inference: {str(e)}")
+        return None
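A standalone usage sketch, assuming one of the example images shipped with the Space is present:

    img = Image.open("toyset/1.png")
    depth_map = predict_depth(img, "Spectral")  # returns a PIL Image, or None on failure
    if depth_map is not None:
        depth_map.save("depth_1.png")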
+
+
+def predict_video_depth(input_video, colormap_choice, progress=gr.Progress()):
+    """Main depth prediction function for videos"""
+    if input_video is None:
+        return None
+
+    try:
+        print(f"Starting video processing: {input_video}")
+
+        # Open video file
+        cap = cv2.VideoCapture(input_video)
+        if not cap.isOpened():
+            print(f"Error: Cannot open video file: {input_video}")
+            return None
+
+        # Get video properties
+        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        input_fps = cap.get(cv2.CAP_PROP_FPS)
+        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+
+        print(f"Video properties: {total_frames} frames, {input_fps} FPS, {width}x{height}")
+
+        # Create temporary output video file
+        with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as tmp_file:
+            output_path = tmp_file.name
+
+        # Set video encoder
+        fourcc = cv2.VideoWriter.fourcc(*'mp4v')
+        out = cv2.VideoWriter(output_path, fourcc, input_fps, (width, height))
+
+        if not out.isOpened():
+            print(f"Error: Cannot create output video: {output_path}")
+            cap.release()
+            return None
+
+        frame_count = 0
+
+        # Process each frame
+        while True:
+            ret, frame = cap.read()
+            if not ret:
+                break
+
+            frame_count += 1
+            progress_percent = frame_count / total_frames
+            progress(progress_percent, desc=f"Processing frame {frame_count}/{total_frames}")
+
+            try:
+                # Preprocess current frame
+                image_tensor, original_size = preprocess_image_from_array(frame)
+                if torch.cuda.is_available():
+                    image_tensor = image_tensor.cuda()
+
+                # Perform depth estimation
+                with torch.no_grad():
+                    prediction = model(image_tensor)
+                    disparity_tensor = prediction['out']
+                    depth_tensor = normalize_depth(disparity_tensor)
+
+                # Postprocess depth map
+                depth = postprocess_depth(depth_tensor, original_size)
+
+                # Handle failed processing
+                if depth is None:
+                    if depth_tensor.dim() == 1:
+                        h, w = original_size
+                        expected_size = h * w
+                        if depth_tensor.shape[0] == expected_size:
+                            depth_tensor = depth_tensor.view(1, 1, h, w)
+                        else:
+                            import math
+                            side_length = int(math.sqrt(depth_tensor.shape[0]))
+                            if side_length * side_length == depth_tensor.shape[0]:
+                                depth_tensor = depth_tensor.view(1, 1, side_length, side_length)
+                    depth = postprocess_depth(depth_tensor, original_size)
+
+                # Generate colored depth map
+                if depth is None:
+                    print(f"Warning: Failed to process frame {frame_count}, using black frame")
+                    depth_frame = np.zeros((height, width, 3), dtype=np.uint8)
+                else:
+                    if colormap_choice.lower() == 'inferno':
+                        depth_frame = cv2.applyColorMap((depth * 255).astype(np.uint8), cv2.COLORMAP_INFERNO)
+                    elif colormap_choice.lower() == 'spectral':
+                        from matplotlib import cm
+                        spectral_cmap = cm.get_cmap('Spectral_r')
+                        depth_frame = (spectral_cmap(depth) * 255).astype(np.uint8)
+                        depth_frame = cv2.cvtColor(depth_frame, cv2.COLOR_RGBA2BGR)
+                    else:  # gray
+                        depth_frame = (depth * 255).astype(np.uint8)
+                        depth_frame = cv2.cvtColor(depth_frame, cv2.COLOR_GRAY2BGR)
+
+                # Write to output video
+                out.write(depth_frame)
+
+            except Exception as e:
+                print(f"Error processing frame {frame_count}: {str(e)}")
+                # Write black frame
+                black_frame = np.zeros((height, width, 3), dtype=np.uint8)
+                out.write(black_frame)
+
+        # Release resources
+        cap.release()
+        out.release()
+
+        print(f"Video processing completed! Output saved to: {output_path}")
+        return output_path
+
+    except Exception as e:
+        print(f"Error during video inference: {str(e)}")
+        return None
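If the rendered result plays poorly in browsers, the mp4v tag is a common culprit; a hedged alternative (only valid when the local OpenCV build ships H.264 support) is:

    fourcc = cv2.VideoWriter.fourcc(*'avc1')  # H.264; assumes codec support in this OpenCV build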
+
+
+with gr.Blocks(title="Depth Anything AC - Depth Estimation Demo", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("""
+    # 🌊 Depth Anything AC - Depth Estimation Demo
+
+    Upload an image or video and the AI will generate the corresponding depth map. Different colors represent different distances, letting you see the three-dimensional structure of the scene.
+
+    ## How to Use
+    1. Choose the image or video tab
+    2. Upload your file
+    3. Select your preferred colormap style
+    4. Click the "Generate Depth Map" button
+    5. View and download the results
+    """)
+
+    with gr.Tabs():
+        # Image processing tab
+        with gr.TabItem("📷 Image Depth Estimation"):
+            with gr.Row():
+                with gr.Column():
+                    input_image = gr.Image(
+                        label="Upload Image",
+                        type="pil",
+                        height=400
+                    )
+
+                    image_colormap_choice = gr.Dropdown(
+                        choices=["Spectral", "Inferno", "Gray"],
+                        value="Spectral",
+                        label="Colormap"
+                    )
+
+                    image_submit_btn = gr.Button(
+                        "🎯 Generate Image Depth Map",
+                        variant="primary",
+                        size="lg"
+                    )
+
+                with gr.Column():
+                    output_image = gr.Image(
+                        label="Depth Map Result",
+                        type="pil",
+                        height=400
+                    )
+
+            gr.Examples(
+                examples=[
+                    ["toyset/1.png", "Spectral"],
+                    ["toyset/2.png", "Spectral"],
+                    ["toyset/good.png", "Spectral"],
+                ] if os.path.exists("toyset") else [],
+                inputs=[input_image, image_colormap_choice],
+                outputs=output_image,
+                fn=predict_depth,
+                cache_examples=False,
+                label="Try these example images"
+            )
+
+        # Video processing tab
+        with gr.TabItem("🎬 Video Depth Estimation"):
+            with gr.Row():
+                with gr.Column():
+                    input_video = gr.Video(
+                        label="Upload Video",
+                        height=400
+                    )
+
+                    video_colormap_choice = gr.Dropdown(
+                        choices=["Spectral", "Inferno", "Gray"],
+                        value="Spectral",
+                        label="Colormap"
+                    )
+
+                    video_submit_btn = gr.Button(
+                        "🎯 Generate Video Depth Map",
+                        variant="primary",
+                        size="lg"
+                    )
+
+                with gr.Column():
+                    output_video = gr.Video(
+                        label="Depth Map Video Result",
+                        height=400
+                    )
+
+            gr.Examples(
+                examples=[
+                    ["toyset/fog.mp4", "Spectral"],
+                    ["toyset/snow.mp4", "Spectral"],
+                ] if os.path.exists("toyset/fog.mp4") and os.path.exists("toyset/snow.mp4") else [],
+                inputs=[input_video, video_colormap_choice],
+                outputs=output_video,
+                fn=predict_video_depth,
+                cache_examples=False,
+                label="Try these example videos"
+            )
+
+    # Event bindings
+    image_submit_btn.click(
+        fn=predict_depth,
+        inputs=[input_image, image_colormap_choice],
+        outputs=output_image,
+        show_progress=True
+    )
+
+    video_submit_btn.click(
+        fn=predict_video_depth,
+        inputs=[input_video, video_colormap_choice],
+        outputs=output_video,
+        show_progress=True
+    )
+
+    gr.Markdown("""
+    ## 📝 Notes
+    - **Spectral**: Rainbow spectrum with distinct near-far contrast
+    - **Inferno**: Flame spectrum with warm tones
+    - **Gray**: Classic grayscale rendering
+
+    ## 💡 Tips
+    - Image processing is fast and suits quick single-image previews
+    - Video processing may take a while, please be patient
+    - A GPU is recommended for faster processing
+    """)
+
+
+if __name__ == "__main__":
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        show_error=True
+    )
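Once the Space is up, a client-side sketch (the endpoint name and argument order are assumptions; the Space's "Use via API" panel shows the real signature):

    from gradio_client import Client, handle_file
    client = Client("http://localhost:7860/")
    result = client.predict(handle_file("toyset/1.png"), "Spectral", api_name="/predict")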