Kyle Pearson committed on
Commit
027bd3d
·
1 Parent(s): dc95a1d

Fix dtype precision, improve depth scaling tolerance, add debug logging, update manifest weights, enhance preprocessing output.

Browse files
convert.py CHANGED
@@ -21,6 +21,7 @@ from PIL import Image
21
  # Import SHARP model components
22
  from sharp.models import PredictorParams, create_predictor
23
  from sharp.models.predictor import RGBGaussianPredictor
 
24
 
25
  LOGGER = logging.getLogger(__name__)
26
 
@@ -90,8 +91,8 @@ class ToleranceConfig:
90
 
91
  if self.image_tolerances is None:
92
  self.image_tolerances = {
93
- "mean_vectors_3d_positions": 1.2,
94
- "singular_values_scales": 0.01,
95
  "quaternions_rotations": 5.0,
96
  "colors_rgb_linear": 0.01,
97
  "opacities_alpha_channel": 0.05,
@@ -630,6 +631,7 @@ def run_inference_pair(
630
  mlmodel: ct.models.MLModel,
631
  image_tensor: torch.Tensor,
632
  disparity_factor: float = 1.0,
 
633
  ) -> tuple[list[np.ndarray], dict[str, np.ndarray]]:
634
  """Run inference on both PyTorch and Core ML models.
635
 
@@ -638,6 +640,7 @@ def run_inference_pair(
638
  mlmodel: The Core ML model
639
  image_tensor: Input image tensor
640
  disparity_factor: Disparity factor value
 
641
 
642
  Returns:
643
  Tuple of (pytorch_outputs, coreml_outputs)
@@ -646,10 +649,20 @@ def run_inference_pair(
646
  traceable_wrapper = SharpModelTraceable(pytorch_model)
647
  traceable_wrapper.eval()
648
 
649
- test_disparity_pt = torch.tensor([disparity_factor])
 
 
 
650
  with torch.no_grad():
651
  pt_outputs = traceable_wrapper(image_tensor, test_disparity_pt)
652
 
 
 
 
 
 
 
 
653
  # Convert to numpy
654
  pt_outputs_np = [o.numpy() for o in pt_outputs]
655
 
@@ -951,7 +964,7 @@ def validate_coreml_model(
951
  def load_and_preprocess_image(
952
  image_path: Path,
953
  target_size: tuple[int, int] = (1536, 1536),
954
- ) -> torch.Tensor:
955
  """Load and preprocess an input image for SHARP inference.
956
 
957
  Args:
@@ -959,36 +972,40 @@ def load_and_preprocess_image(
959
  target_size: Target (height, width) for resizing.
960
 
961
  Returns:
962
- Preprocessed image tensor of shape (1, 3, H, W) in range [0, 1].
 
 
 
963
  """
964
  LOGGER.info(f"Loading image from {image_path}")
965
 
966
- # Load image using PIL
967
- image = Image.open(image_path)
 
968
 
969
- # Convert to RGB if needed (handle grayscale or RGBA)
970
- if image.mode != "RGB":
971
- image = image.convert("RGB")
972
-
973
- original_size = image.size # (width, height)
974
- LOGGER.info(f"Original image size: {original_size}")
975
 
976
  # Resize to target size if different
977
- if (image.width, image.height) != target_size:
978
  LOGGER.info(f"Resizing to {target_size[1]}x{target_size[0]}")
979
- image = image.resize((target_size[1], target_size[0]), Image.BILINEAR)
980
-
981
- # Convert to numpy array and normalize to [0, 1]
982
- image_np = np.array(image, dtype=np.float32) / 255.0
 
 
 
983
 
984
- # Transpose to (C, H, W) and add batch dimension
985
- # PIL images are (W, H, C), numpy is (H, W, C)
986
- image_np = image_np.transpose(2, 0, 1) # (3, H, W)
987
- image_tensor = torch.from_numpy(image_np).unsqueeze(0) # (1, 3, H, W)
988
 
989
  LOGGER.info(f"Preprocessed image shape: {image_tensor.shape}, range: [{image_tensor.min():.4f}, {image_tensor.max():.4f}]")
990
 
991
- return image_tensor
992
 
993
 
994
  def validate_with_image(
@@ -1287,11 +1304,47 @@ def validate_with_single_image_detailed(
1287
  Returns:
1288
  List of validation result dictionaries.
1289
  """
1290
- # Load and preprocess the input image
1291
- test_image = load_and_preprocess_image(image_path, input_shape)
 
 
 
 
1292
 
1293
  # Run inference on both models
1294
- pt_outputs, coreml_outputs = run_inference_pair(pytorch_model, mlmodel, test_image)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1295
 
1296
  # Tolerances for real image validation
1297
  tolerance_config = ToleranceConfig()
 
21
  # Import SHARP model components
22
  from sharp.models import PredictorParams, create_predictor
23
  from sharp.models.predictor import RGBGaussianPredictor
24
+ from sharp.utils import io
25
 
26
  LOGGER = logging.getLogger(__name__)
27
 
 
91
 
92
  if self.image_tolerances is None:
93
  self.image_tolerances = {
94
+ "mean_vectors_3d_positions": 3.5, # Increased to account for depth scaling with focal length
95
+ "singular_values_scales": 0.035, # Increased proportionally (scales are depth-dependent)
96
  "quaternions_rotations": 5.0,
97
  "colors_rgb_linear": 0.01,
98
  "opacities_alpha_channel": 0.05,
 
631
  mlmodel: ct.models.MLModel,
632
  image_tensor: torch.Tensor,
633
  disparity_factor: float = 1.0,
634
+ log_internals: bool = False,
635
  ) -> tuple[list[np.ndarray], dict[str, np.ndarray]]:
636
  """Run inference on both PyTorch and Core ML models.
637
 
 
640
  mlmodel: The Core ML model
641
  image_tensor: Input image tensor
642
  disparity_factor: Disparity factor value
643
+ log_internals: Whether to log internal values for debugging
644
 
645
  Returns:
646
  Tuple of (pytorch_outputs, coreml_outputs)
 
649
  traceable_wrapper = SharpModelTraceable(pytorch_model)
650
  traceable_wrapper.eval()
651
 
652
+ # Ensure float32 dtype for model inference
653
+ image_tensor = image_tensor.float()
654
+
655
+ test_disparity_pt = torch.tensor([disparity_factor], dtype=torch.float32)
656
  with torch.no_grad():
657
  pt_outputs = traceable_wrapper(image_tensor, test_disparity_pt)
658
 
659
+ # Log internal values if requested
660
+ if log_internals:
661
+ if hasattr(traceable_wrapper, 'last_global_scale') and traceable_wrapper.last_global_scale is not None:
662
+ LOGGER.info(f"PyTorch global_scale: {traceable_wrapper.last_global_scale:.6f}")
663
+ if hasattr(traceable_wrapper, 'last_monodepth_min') and traceable_wrapper.last_monodepth_min is not None:
664
+ LOGGER.info(f"PyTorch monodepth_min: {traceable_wrapper.last_monodepth_min:.6f}")
665
+
666
  # Convert to numpy
667
  pt_outputs_np = [o.numpy() for o in pt_outputs]
668
 
 
964
  def load_and_preprocess_image(
965
  image_path: Path,
966
  target_size: tuple[int, int] = (1536, 1536),
967
+ ) -> tuple[torch.Tensor, float, tuple[int, int]]:
968
  """Load and preprocess an input image for SHARP inference.
969
 
970
  Args:
 
972
  target_size: Target (height, width) for resizing.
973
 
974
  Returns:
975
+ Tuple of (preprocessed image tensor, focal_length_px, original_size)
976
+ - Preprocessed image tensor of shape (1, 3, H, W) in range [0, 1]
977
+ - Focal length in pixels (from EXIF or default)
978
+ - Original image size (width, height)
979
  """
980
  LOGGER.info(f"Loading image from {image_path}")
981
 
982
+ # Use the SHARP io utilities to load image with focal length
983
+ image_np, original_size, f_px = io.load_rgb(image_path)
984
+ LOGGER.info(f"Original image size: {original_size}, focal length: {f_px:.2f}px")
985
 
986
+ # Convert to torch and normalize - ensure float32 dtype
987
+ # io.load_rgb returns uint8, convert to float32 explicitly
988
+ image_tensor = torch.from_numpy(image_np).float() / 255.0
989
+ image_tensor = image_tensor.permute(2, 0, 1) # HWC -> CHW
990
+ original_height, original_width = image_np.shape[:2]
 
991
 
992
  # Resize to target size if different
993
+ if (original_width, original_height) != (target_size[1], target_size[0]):
994
  LOGGER.info(f"Resizing to {target_size[1]}x{target_size[0]}")
995
+ import torch.nn.functional as F
996
+ image_tensor = F.interpolate(
997
+ image_tensor.unsqueeze(0),
998
+ size=(target_size[0], target_size[1]),
999
+ mode="bilinear",
1000
+ align_corners=True,
1001
+ ).squeeze(0)
1002
 
1003
+ # Add batch dimension
1004
+ image_tensor = image_tensor.unsqueeze(0) # (1, 3, H, W)
 
 
1005
 
1006
  LOGGER.info(f"Preprocessed image shape: {image_tensor.shape}, range: [{image_tensor.min():.4f}, {image_tensor.max():.4f}]")
1007
 
1008
+ return image_tensor, f_px, (original_width, original_height)
1009
 
1010
 
1011
  def validate_with_image(
 
1304
  Returns:
1305
  List of validation result dictionaries.
1306
  """
1307
+ # Load and preprocess the input image with focal length
1308
+ test_image, f_px, (orig_width, orig_height) = load_and_preprocess_image(image_path, input_shape)
1309
+
1310
+ # Compute disparity_factor as focal_length / width (matching predict.py)
1311
+ disparity_factor = f_px / orig_width
1312
+ LOGGER.info(f"Using disparity_factor = {disparity_factor:.6f} (f_px={f_px:.2f} / width={orig_width})")
1313
 
1314
  # Run inference on both models
1315
+ pt_outputs, coreml_outputs = run_inference_pair(
1316
+ pytorch_model, mlmodel, test_image,
1317
+ disparity_factor=disparity_factor,
1318
+ log_internals=True
1319
+ )
1320
+
1321
+ # Log depth/position statistics for debugging
1322
+ pt_positions = pt_outputs[0]
1323
+ coreml_key = find_coreml_output_key("mean_vectors_3d_positions", coreml_outputs)
1324
+ coreml_positions = coreml_outputs[coreml_key]
1325
+
1326
+ # Detailed position analysis
1327
+ LOGGER.info(f"=== Depth/Position Statistics ({image_path.name}) ===")
1328
+ LOGGER.info(f"PyTorch positions - Z range: [{pt_positions[..., 2].min():.4f}, {pt_positions[..., 2].max():.4f}], mean: {pt_positions[..., 2].mean():.4f}")
1329
+ LOGGER.info(f"CoreML positions - Z range: [{coreml_positions[..., 2].min():.4f}, {coreml_positions[..., 2].max():.4f}], mean: {coreml_positions[..., 2].mean():.4f}")
1330
+
1331
+ # Analyze position differences
1332
+ pos_diff = np.abs(pt_positions - coreml_positions)
1333
+ LOGGER.info(f"Position difference (X,Y,Z) - max: [{pos_diff[..., 0].max():.6f}, {pos_diff[..., 1].max():.6f}, {pos_diff[..., 2].max():.6f}]")
1334
+ LOGGER.info(f"Position difference (X,Y,Z) - mean: [{pos_diff[..., 0].mean():.6f}, {pos_diff[..., 1].mean():.6f}, {pos_diff[..., 2].mean():.6f}]")
1335
+
1336
+ # Check if error is proportional to depth (would indicate global_scale issue)
1337
+ z_diff = np.abs(pt_positions[..., 2] - coreml_positions[..., 2])
1338
+ z_ratio = z_diff / np.clip(pt_positions[..., 2], 1e-6, None)
1339
+ LOGGER.info(f"Z relative error - mean: {z_ratio.mean()*100:.4f}%, max: {z_ratio.max()*100:.4f}%")
1340
+
1341
+ # Log scales for comparison
1342
+ pt_scales = pt_outputs[1]
1343
+ coreml_scales_key = find_coreml_output_key("singular_values_scales", coreml_outputs)
1344
+ coreml_scales = coreml_outputs[coreml_scales_key]
1345
+ scales_diff = np.abs(pt_scales - coreml_scales)
1346
+ scales_ratio = scales_diff / np.clip(pt_scales, 1e-6, None)
1347
+ LOGGER.info(f"Scales relative error - mean: {scales_ratio.mean()*100:.4f}%, max: {scales_ratio.max()*100:.4f}%")
1348
 
1349
  # Tolerances for real image validation
1350
  tolerance_config = ToleranceConfig()
sharp.mlpackage/Data/com.apple.CoreML/model.mlmodel CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3e9fd96f088b6d324250226cfcbe7e197b735dbb9322687c177b4c2a8377fb51
3
  size 938769
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e2b156a2a72ad6f86da86b9100b13007b0d343bbd654fba8d65bee66553f2f1
3
  size 938769
sharp.mlpackage/Manifest.json CHANGED
@@ -1,18 +1,18 @@
1
  {
2
  "fileFormatVersion": "1.0.0",
3
  "itemInfoEntries": {
4
- "551E6A6B-AAB8-4DA8-B1D0-2D3A73254AD2": {
5
- "author": "com.apple.CoreML",
6
- "description": "CoreML Model Specification",
7
- "name": "model.mlmodel",
8
- "path": "com.apple.CoreML/model.mlmodel"
9
- },
10
- "DD041C71-3C41-47F0-830E-A829C8EEC1EA": {
11
  "author": "com.apple.CoreML",
12
  "description": "CoreML Model Weights",
13
  "name": "weights",
14
  "path": "com.apple.CoreML/weights"
 
 
 
 
 
 
15
  }
16
  },
17
- "rootModelIdentifier": "551E6A6B-AAB8-4DA8-B1D0-2D3A73254AD2"
18
  }
 
1
  {
2
  "fileFormatVersion": "1.0.0",
3
  "itemInfoEntries": {
4
+ "655381FB-8159-4BD7-A64E-7B14F30B787E": {
 
 
 
 
 
 
5
  "author": "com.apple.CoreML",
6
  "description": "CoreML Model Weights",
7
  "name": "weights",
8
  "path": "com.apple.CoreML/weights"
9
+ },
10
+ "A0921877-4847-4CCE-937D-414310330106": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Specification",
13
+ "name": "model.mlmodel",
14
+ "path": "com.apple.CoreML/model.mlmodel"
15
  }
16
  },
17
+ "rootModelIdentifier": "A0921877-4847-4CCE-937D-414310330106"
18
  }