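Force float32 inputs for inference, loosen the position and scale tolerances to account for focal-length-dependent depth scaling, add debug logging of internal values and position/scale errors, regenerate the Core ML model spec and manifest identifiers, and return focal length and original image size from image preprocessing.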

Files changed (3)

convert.py +79 -26
sharp.mlpackage/Data/com.apple.CoreML/model.mlmodel +1 -1
sharp.mlpackage/Manifest.json +8 -8

convert.py CHANGED Viewed

@@ -21,6 +21,7 @@ from PIL import Image
 # Import SHARP model components
 from sharp.models import PredictorParams, create_predictor
 from sharp.models.predictor import RGBGaussianPredictor
 LOGGER = logging.getLogger(__name__)
@@ -90,8 +91,8 @@ class ToleranceConfig:
         if self.image_tolerances is None:
             self.image_tolerances = {
-                "mean_vectors_3d_positions": 1.2,
-                "singular_values_scales": 0.01,
                 "quaternions_rotations": 5.0,
                 "colors_rgb_linear": 0.01,
                 "opacities_alpha_channel": 0.05,
@@ -630,6 +631,7 @@ def run_inference_pair(
     mlmodel: ct.models.MLModel,
     image_tensor: torch.Tensor,
     disparity_factor: float = 1.0,
 ) -> tuple[list[np.ndarray], dict[str, np.ndarray]]:
     """Run inference on both PyTorch and Core ML models.
@@ -638,6 +640,7 @@ def run_inference_pair(
         mlmodel: The Core ML model
         image_tensor: Input image tensor
         disparity_factor: Disparity factor value
     Returns:
         Tuple of (pytorch_outputs, coreml_outputs)
@@ -646,10 +649,20 @@ def run_inference_pair(
     traceable_wrapper = SharpModelTraceable(pytorch_model)
     traceable_wrapper.eval()
-    test_disparity_pt = torch.tensor([disparity_factor])
     with torch.no_grad():
         pt_outputs = traceable_wrapper(image_tensor, test_disparity_pt)
     # Convert to numpy
     pt_outputs_np = [o.numpy() for o in pt_outputs]
@@ -951,7 +964,7 @@ def validate_coreml_model(
 def load_and_preprocess_image(
     image_path: Path,
     target_size: tuple[int, int] = (1536, 1536),
-) -> torch.Tensor:
     """Load and preprocess an input image for SHARP inference.
     Args:
@@ -959,36 +972,40 @@ def load_and_preprocess_image(
         target_size: Target (height, width) for resizing.
     Returns:
-        Preprocessed image tensor of shape (1, 3, H, W) in range [0, 1].
     """
     LOGGER.info(f"Loading image from {image_path}")
-    # Load image using PIL
-    image = Image.open(image_path)
-    # Convert to RGB if needed (handle grayscale or RGBA)
-    if image.mode != "RGB":
-        image = image.convert("RGB")
-    original_size = image.size  # (width, height)
-    LOGGER.info(f"Original image size: {original_size}")
     # Resize to target size if different
-    if (image.width, image.height) != target_size:
         LOGGER.info(f"Resizing to {target_size[1]}x{target_size[0]}")
-        image = image.resize((target_size[1], target_size[0]), Image.BILINEAR)
-    # Convert to numpy array and normalize to [0, 1]
-    image_np = np.array(image, dtype=np.float32) / 255.0
-    # Transpose to (C, H, W) and add batch dimension
-    # PIL images are (W, H, C), numpy is (H, W, C)
-    image_np = image_np.transpose(2, 0, 1)  # (3, H, W)
-    image_tensor = torch.from_numpy(image_np).unsqueeze(0)  # (1, 3, H, W)
     LOGGER.info(f"Preprocessed image shape: {image_tensor.shape}, range: [{image_tensor.min():.4f}, {image_tensor.max():.4f}]")
-    return image_tensor
 def validate_with_image(
@@ -1287,11 +1304,47 @@ def validate_with_single_image_detailed(
     Returns:
         List of validation result dictionaries.
     """
-    # Load and preprocess the input image
-    test_image = load_and_preprocess_image(image_path, input_shape)
     # Run inference on both models
-    pt_outputs, coreml_outputs = run_inference_pair(pytorch_model, mlmodel, test_image)
     # Tolerances for real image validation
     tolerance_config = ToleranceConfig()

 # Import SHARP model components
 from sharp.models import PredictorParams, create_predictor
 from sharp.models.predictor import RGBGaussianPredictor
+from sharp.utils import io
 LOGGER = logging.getLogger(__name__)
         if self.image_tolerances is None:
             self.image_tolerances = {
+                "mean_vectors_3d_positions": 3.5,  # Increased to account for depth scaling with focal length
+                "singular_values_scales": 0.035,    # Increased proportionally (scales are depth-dependent)
                 "quaternions_rotations": 5.0,
                 "colors_rgb_linear": 0.01,
                 "opacities_alpha_channel": 0.05,
     mlmodel: ct.models.MLModel,
     image_tensor: torch.Tensor,
     disparity_factor: float = 1.0,
+    log_internals: bool = False,
 ) -> tuple[list[np.ndarray], dict[str, np.ndarray]]:
     """Run inference on both PyTorch and Core ML models.
         mlmodel: The Core ML model
         image_tensor: Input image tensor
         disparity_factor: Disparity factor value
+        log_internals: Whether to log internal values for debugging
     Returns:
         Tuple of (pytorch_outputs, coreml_outputs)
     traceable_wrapper = SharpModelTraceable(pytorch_model)
     traceable_wrapper.eval()
+    # Ensure float32 dtype for model inference
+    image_tensor = image_tensor.float()
+    test_disparity_pt = torch.tensor([disparity_factor], dtype=torch.float32)
     with torch.no_grad():
         pt_outputs = traceable_wrapper(image_tensor, test_disparity_pt)
+    # Log internal values if requested
+    if log_internals:
+        if hasattr(traceable_wrapper, 'last_global_scale') and traceable_wrapper.last_global_scale is not None:
+            LOGGER.info(f"PyTorch global_scale: {traceable_wrapper.last_global_scale:.6f}")
+        if hasattr(traceable_wrapper, 'last_monodepth_min') and traceable_wrapper.last_monodepth_min is not None:
+            LOGGER.info(f"PyTorch monodepth_min: {traceable_wrapper.last_monodepth_min:.6f}")
     # Convert to numpy
     pt_outputs_np = [o.numpy() for o in pt_outputs]
 def load_and_preprocess_image(
     image_path: Path,
     target_size: tuple[int, int] = (1536, 1536),
+) -> tuple[torch.Tensor, float, tuple[int, int]]:
     """Load and preprocess an input image for SHARP inference.
     Args:
         target_size: Target (height, width) for resizing.
     Returns:
+        Tuple of (preprocessed image tensor, focal_length_px, original_size)
+        - Preprocessed image tensor of shape (1, 3, H, W) in range [0, 1]
+        - Focal length in pixels (from EXIF or default)
+        - Original image size (width, height)
     """
     LOGGER.info(f"Loading image from {image_path}")
+    # Use the SHARP io utilities to load image with focal length
+    image_np, original_size, f_px = io.load_rgb(image_path)
+    LOGGER.info(f"Original image size: {original_size}, focal length: {f_px:.2f}px")
+    # Convert to torch and normalize - ensure float32 dtype
+    # io.load_rgb returns uint8, convert to float32 explicitly
+    image_tensor = torch.from_numpy(image_np).float() / 255.0
+    image_tensor = image_tensor.permute(2, 0, 1)  # HWC -> CHW
+    original_height, original_width = image_np.shape[:2]
     # Resize to target size if different
+    if (original_width, original_height) != (target_size[1], target_size[0]):
         LOGGER.info(f"Resizing to {target_size[1]}x{target_size[0]}")
+        import torch.nn.functional as F
+        image_tensor = F.interpolate(
+            image_tensor.unsqueeze(0),
+            size=(target_size[0], target_size[1]),
+            mode="bilinear",
+            align_corners=True,
+        ).squeeze(0)
+    # Add batch dimension
+    image_tensor = image_tensor.unsqueeze(0)  # (1, 3, H, W)
     LOGGER.info(f"Preprocessed image shape: {image_tensor.shape}, range: [{image_tensor.min():.4f}, {image_tensor.max():.4f}]")
+    return image_tensor, f_px, (original_width, original_height)
 def validate_with_image(
     Returns:
         List of validation result dictionaries.
     """
+    # Load and preprocess the input image with focal length
+    test_image, f_px, (orig_width, orig_height) = load_and_preprocess_image(image_path, input_shape)
+    # Compute disparity_factor as focal_length / width (matching predict.py)
+    disparity_factor = f_px / orig_width
+    LOGGER.info(f"Using disparity_factor = {disparity_factor:.6f} (f_px={f_px:.2f} / width={orig_width})")
     # Run inference on both models
+    pt_outputs, coreml_outputs = run_inference_pair(
+        pytorch_model, mlmodel, test_image,
+        disparity_factor=disparity_factor,
+        log_internals=True
+    )
+    # Log depth/position statistics for debugging
+    pt_positions = pt_outputs[0]
+    coreml_key = find_coreml_output_key("mean_vectors_3d_positions", coreml_outputs)
+    coreml_positions = coreml_outputs[coreml_key]
+    # Detailed position analysis
+    LOGGER.info(f"=== Depth/Position Statistics ({image_path.name}) ===")
+    LOGGER.info(f"PyTorch positions - Z range: [{pt_positions[..., 2].min():.4f}, {pt_positions[..., 2].max():.4f}], mean: {pt_positions[..., 2].mean():.4f}")
+    LOGGER.info(f"CoreML positions - Z range: [{coreml_positions[..., 2].min():.4f}, {coreml_positions[..., 2].max():.4f}], mean: {coreml_positions[..., 2].mean():.4f}")
+    # Analyze position differences
+    pos_diff = np.abs(pt_positions - coreml_positions)
+    LOGGER.info(f"Position difference (X,Y,Z) - max: [{pos_diff[..., 0].max():.6f}, {pos_diff[..., 1].max():.6f}, {pos_diff[..., 2].max():.6f}]")
+    LOGGER.info(f"Position difference (X,Y,Z) - mean: [{pos_diff[..., 0].mean():.6f}, {pos_diff[..., 1].mean():.6f}, {pos_diff[..., 2].mean():.6f}]")
+    # Check if error is proportional to depth (would indicate global_scale issue)
+    z_diff = np.abs(pt_positions[..., 2] - coreml_positions[..., 2])
+    z_ratio = z_diff / np.clip(pt_positions[..., 2], 1e-6, None)
+    LOGGER.info(f"Z relative error - mean: {z_ratio.mean()*100:.4f}%, max: {z_ratio.max()*100:.4f}%")
+    # Log scales for comparison
+    pt_scales = pt_outputs[1]
+    coreml_scales_key = find_coreml_output_key("singular_values_scales", coreml_outputs)
+    coreml_scales = coreml_outputs[coreml_scales_key]
+    scales_diff = np.abs(pt_scales - coreml_scales)
+    scales_ratio = scales_diff / np.clip(pt_scales, 1e-6, None)
+    LOGGER.info(f"Scales relative error - mean: {scales_ratio.mean()*100:.4f}%, max: {scales_ratio.max()*100:.4f}%")
     # Tolerances for real image validation
     tolerance_config = ToleranceConfig()

sharp.mlpackage/Data/com.apple.CoreML/model.mlmodel CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3e9fd96f088b6d324250226cfcbe7e197b735dbb9322687c177b4c2a8377fb51
 size 938769

 version https://git-lfs.github.com/spec/v1
+oid sha256:6e2b156a2a72ad6f86da86b9100b13007b0d343bbd654fba8d65bee66553f2f1
 size 938769

sharp.mlpackage/Manifest.json CHANGED Viewed

@@ -1,18 +1,18 @@
 {
     "fileFormatVersion": "1.0.0",
     "itemInfoEntries": {
-        "551E6A6B-AAB8-4DA8-B1D0-2D3A73254AD2": {
-            "author": "com.apple.CoreML",
-            "description": "CoreML Model Specification",
-            "name": "model.mlmodel",
-            "path": "com.apple.CoreML/model.mlmodel"
-        },
-        "DD041C71-3C41-47F0-830E-A829C8EEC1EA": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Weights",
             "name": "weights",
             "path": "com.apple.CoreML/weights"
         }
     },
-    "rootModelIdentifier": "551E6A6B-AAB8-4DA8-B1D0-2D3A73254AD2"
 }

 {
     "fileFormatVersion": "1.0.0",
     "itemInfoEntries": {
+        "655381FB-8159-4BD7-A64E-7B14F30B787E": {
             "author": "com.apple.CoreML",
             "description": "CoreML Model Weights",
             "name": "weights",
             "path": "com.apple.CoreML/weights"
+        },
+        "A0921877-4847-4CCE-937D-414310330106": {
+            "author": "com.apple.CoreML",
+            "description": "CoreML Model Specification",
+            "name": "model.mlmodel",
+            "path": "com.apple.CoreML/model.mlmodel"
         }
     },
+    "rootModelIdentifier": "A0921877-4847-4CCE-937D-414310330106"
 }