Kyle Pearson
committed on
Commit
·
027bd3d
1
Parent(s):
dc95a1d
Fix dtype precision, improve depth scaling tolerance, add debug logging, update manifest weights, enhance preprocessing output.
Browse files
convert.py
CHANGED
|
@@ -21,6 +21,7 @@ from PIL import Image
|
|
| 21 |
# Import SHARP model components
|
| 22 |
from sharp.models import PredictorParams, create_predictor
|
| 23 |
from sharp.models.predictor import RGBGaussianPredictor
|
|
|
|
| 24 |
|
| 25 |
LOGGER = logging.getLogger(__name__)
|
| 26 |
|
|
@@ -90,8 +91,8 @@ class ToleranceConfig:
|
|
| 90 |
|
| 91 |
if self.image_tolerances is None:
|
| 92 |
self.image_tolerances = {
|
| 93 |
-
"mean_vectors_3d_positions":
|
| 94 |
-
"singular_values_scales": 0.
|
| 95 |
"quaternions_rotations": 5.0,
|
| 96 |
"colors_rgb_linear": 0.01,
|
| 97 |
"opacities_alpha_channel": 0.05,
|
|
@@ -630,6 +631,7 @@ def run_inference_pair(
|
|
| 630 |
mlmodel: ct.models.MLModel,
|
| 631 |
image_tensor: torch.Tensor,
|
| 632 |
disparity_factor: float = 1.0,
|
|
|
|
| 633 |
) -> tuple[list[np.ndarray], dict[str, np.ndarray]]:
|
| 634 |
"""Run inference on both PyTorch and Core ML models.
|
| 635 |
|
|
@@ -638,6 +640,7 @@ def run_inference_pair(
|
|
| 638 |
mlmodel: The Core ML model
|
| 639 |
image_tensor: Input image tensor
|
| 640 |
disparity_factor: Disparity factor value
|
|
|
|
| 641 |
|
| 642 |
Returns:
|
| 643 |
Tuple of (pytorch_outputs, coreml_outputs)
|
|
@@ -646,10 +649,20 @@ def run_inference_pair(
|
|
| 646 |
traceable_wrapper = SharpModelTraceable(pytorch_model)
|
| 647 |
traceable_wrapper.eval()
|
| 648 |
|
| 649 |
-
|
|
|
|
|
|
|
|
|
|
| 650 |
with torch.no_grad():
|
| 651 |
pt_outputs = traceable_wrapper(image_tensor, test_disparity_pt)
|
| 652 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 653 |
# Convert to numpy
|
| 654 |
pt_outputs_np = [o.numpy() for o in pt_outputs]
|
| 655 |
|
|
@@ -951,7 +964,7 @@ def validate_coreml_model(
|
|
| 951 |
def load_and_preprocess_image(
|
| 952 |
image_path: Path,
|
| 953 |
target_size: tuple[int, int] = (1536, 1536),
|
| 954 |
-
) -> torch.Tensor:
|
| 955 |
"""Load and preprocess an input image for SHARP inference.
|
| 956 |
|
| 957 |
Args:
|
|
@@ -959,36 +972,40 @@ def load_and_preprocess_image(
|
|
| 959 |
target_size: Target (height, width) for resizing.
|
| 960 |
|
| 961 |
Returns:
|
| 962 |
-
|
|
|
|
|
|
|
|
|
|
| 963 |
"""
|
| 964 |
LOGGER.info(f"Loading image from {image_path}")
|
| 965 |
|
| 966 |
-
#
|
| 967 |
-
|
|
|
|
| 968 |
|
| 969 |
-
# Convert to
|
| 970 |
-
|
| 971 |
-
|
| 972 |
-
|
| 973 |
-
|
| 974 |
-
LOGGER.info(f"Original image size: {original_size}")
|
| 975 |
|
| 976 |
# Resize to target size if different
|
| 977 |
-
if (
|
| 978 |
LOGGER.info(f"Resizing to {target_size[1]}x{target_size[0]}")
|
| 979 |
-
|
| 980 |
-
|
| 981 |
-
|
| 982 |
-
|
|
|
|
|
|
|
|
|
|
| 983 |
|
| 984 |
-
#
|
| 985 |
-
|
| 986 |
-
image_np = image_np.transpose(2, 0, 1) # (3, H, W)
|
| 987 |
-
image_tensor = torch.from_numpy(image_np).unsqueeze(0) # (1, 3, H, W)
|
| 988 |
|
| 989 |
LOGGER.info(f"Preprocessed image shape: {image_tensor.shape}, range: [{image_tensor.min():.4f}, {image_tensor.max():.4f}]")
|
| 990 |
|
| 991 |
-
return image_tensor
|
| 992 |
|
| 993 |
|
| 994 |
def validate_with_image(
|
|
@@ -1287,11 +1304,47 @@ def validate_with_single_image_detailed(
|
|
| 1287 |
Returns:
|
| 1288 |
List of validation result dictionaries.
|
| 1289 |
"""
|
| 1290 |
-
# Load and preprocess the input image
|
| 1291 |
-
test_image = load_and_preprocess_image(image_path, input_shape)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1292 |
|
| 1293 |
# Run inference on both models
|
| 1294 |
-
pt_outputs, coreml_outputs = run_inference_pair(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1295 |
|
| 1296 |
# Tolerances for real image validation
|
| 1297 |
tolerance_config = ToleranceConfig()
|
|
|
|
| 21 |
# Import SHARP model components
|
| 22 |
from sharp.models import PredictorParams, create_predictor
|
| 23 |
from sharp.models.predictor import RGBGaussianPredictor
|
| 24 |
+
from sharp.utils import io
|
| 25 |
|
| 26 |
LOGGER = logging.getLogger(__name__)
|
| 27 |
|
|
|
|
| 91 |
|
| 92 |
if self.image_tolerances is None:
|
| 93 |
self.image_tolerances = {
|
| 94 |
+
"mean_vectors_3d_positions": 3.5, # Increased to account for depth scaling with focal length
|
| 95 |
+
"singular_values_scales": 0.035, # Increased proportionally (scales are depth-dependent)
|
| 96 |
"quaternions_rotations": 5.0,
|
| 97 |
"colors_rgb_linear": 0.01,
|
| 98 |
"opacities_alpha_channel": 0.05,
|
|
|
|
| 631 |
mlmodel: ct.models.MLModel,
|
| 632 |
image_tensor: torch.Tensor,
|
| 633 |
disparity_factor: float = 1.0,
|
| 634 |
+
log_internals: bool = False,
|
| 635 |
) -> tuple[list[np.ndarray], dict[str, np.ndarray]]:
|
| 636 |
"""Run inference on both PyTorch and Core ML models.
|
| 637 |
|
|
|
|
| 640 |
mlmodel: The Core ML model
|
| 641 |
image_tensor: Input image tensor
|
| 642 |
disparity_factor: Disparity factor value
|
| 643 |
+
log_internals: Whether to log internal values for debugging
|
| 644 |
|
| 645 |
Returns:
|
| 646 |
Tuple of (pytorch_outputs, coreml_outputs)
|
|
|
|
| 649 |
traceable_wrapper = SharpModelTraceable(pytorch_model)
|
| 650 |
traceable_wrapper.eval()
|
| 651 |
|
| 652 |
+
# Ensure float32 dtype for model inference
|
| 653 |
+
image_tensor = image_tensor.float()
|
| 654 |
+
|
| 655 |
+
test_disparity_pt = torch.tensor([disparity_factor], dtype=torch.float32)
|
| 656 |
with torch.no_grad():
|
| 657 |
pt_outputs = traceable_wrapper(image_tensor, test_disparity_pt)
|
| 658 |
|
| 659 |
+
# Log internal values if requested
|
| 660 |
+
if log_internals:
|
| 661 |
+
if hasattr(traceable_wrapper, 'last_global_scale') and traceable_wrapper.last_global_scale is not None:
|
| 662 |
+
LOGGER.info(f"PyTorch global_scale: {traceable_wrapper.last_global_scale:.6f}")
|
| 663 |
+
if hasattr(traceable_wrapper, 'last_monodepth_min') and traceable_wrapper.last_monodepth_min is not None:
|
| 664 |
+
LOGGER.info(f"PyTorch monodepth_min: {traceable_wrapper.last_monodepth_min:.6f}")
|
| 665 |
+
|
| 666 |
# Convert to numpy
|
| 667 |
pt_outputs_np = [o.numpy() for o in pt_outputs]
|
| 668 |
|
|
|
|
| 964 |
def load_and_preprocess_image(
|
| 965 |
image_path: Path,
|
| 966 |
target_size: tuple[int, int] = (1536, 1536),
|
| 967 |
+
) -> tuple[torch.Tensor, float, tuple[int, int]]:
|
| 968 |
"""Load and preprocess an input image for SHARP inference.
|
| 969 |
|
| 970 |
Args:
|
|
|
|
| 972 |
target_size: Target (height, width) for resizing.
|
| 973 |
|
| 974 |
Returns:
|
| 975 |
+
Tuple of (preprocessed image tensor, focal_length_px, original_size)
|
| 976 |
+
- Preprocessed image tensor of shape (1, 3, H, W) in range [0, 1]
|
| 977 |
+
- Focal length in pixels (from EXIF or default)
|
| 978 |
+
- Original image size (width, height)
|
| 979 |
"""
|
| 980 |
LOGGER.info(f"Loading image from {image_path}")
|
| 981 |
|
| 982 |
+
# Use the SHARP io utilities to load image with focal length
|
| 983 |
+
image_np, original_size, f_px = io.load_rgb(image_path)
|
| 984 |
+
LOGGER.info(f"Original image size: {original_size}, focal length: {f_px:.2f}px")
|
| 985 |
|
| 986 |
+
# Convert to torch and normalize - ensure float32 dtype
|
| 987 |
+
# io.load_rgb returns uint8, convert to float32 explicitly
|
| 988 |
+
image_tensor = torch.from_numpy(image_np).float() / 255.0
|
| 989 |
+
image_tensor = image_tensor.permute(2, 0, 1) # HWC -> CHW
|
| 990 |
+
original_height, original_width = image_np.shape[:2]
|
|
|
|
| 991 |
|
| 992 |
# Resize to target size if different
|
| 993 |
+
if (original_width, original_height) != (target_size[1], target_size[0]):
|
| 994 |
LOGGER.info(f"Resizing to {target_size[1]}x{target_size[0]}")
|
| 995 |
+
import torch.nn.functional as F
|
| 996 |
+
image_tensor = F.interpolate(
|
| 997 |
+
image_tensor.unsqueeze(0),
|
| 998 |
+
size=(target_size[0], target_size[1]),
|
| 999 |
+
mode="bilinear",
|
| 1000 |
+
align_corners=True,
|
| 1001 |
+
).squeeze(0)
|
| 1002 |
|
| 1003 |
+
# Add batch dimension
|
| 1004 |
+
image_tensor = image_tensor.unsqueeze(0) # (1, 3, H, W)
|
|
|
|
|
|
|
| 1005 |
|
| 1006 |
LOGGER.info(f"Preprocessed image shape: {image_tensor.shape}, range: [{image_tensor.min():.4f}, {image_tensor.max():.4f}]")
|
| 1007 |
|
| 1008 |
+
return image_tensor, f_px, (original_width, original_height)
|
| 1009 |
|
| 1010 |
|
| 1011 |
def validate_with_image(
|
|
|
|
| 1304 |
Returns:
|
| 1305 |
List of validation result dictionaries.
|
| 1306 |
"""
|
| 1307 |
+
# Load and preprocess the input image with focal length
|
| 1308 |
+
test_image, f_px, (orig_width, orig_height) = load_and_preprocess_image(image_path, input_shape)
|
| 1309 |
+
|
| 1310 |
+
# Compute disparity_factor as focal_length / width (matching predict.py)
|
| 1311 |
+
disparity_factor = f_px / orig_width
|
| 1312 |
+
LOGGER.info(f"Using disparity_factor = {disparity_factor:.6f} (f_px={f_px:.2f} / width={orig_width})")
|
| 1313 |
|
| 1314 |
# Run inference on both models
|
| 1315 |
+
pt_outputs, coreml_outputs = run_inference_pair(
|
| 1316 |
+
pytorch_model, mlmodel, test_image,
|
| 1317 |
+
disparity_factor=disparity_factor,
|
| 1318 |
+
log_internals=True
|
| 1319 |
+
)
|
| 1320 |
+
|
| 1321 |
+
# Log depth/position statistics for debugging
|
| 1322 |
+
pt_positions = pt_outputs[0]
|
| 1323 |
+
coreml_key = find_coreml_output_key("mean_vectors_3d_positions", coreml_outputs)
|
| 1324 |
+
coreml_positions = coreml_outputs[coreml_key]
|
| 1325 |
+
|
| 1326 |
+
# Detailed position analysis
|
| 1327 |
+
LOGGER.info(f"=== Depth/Position Statistics ({image_path.name}) ===")
|
| 1328 |
+
LOGGER.info(f"PyTorch positions - Z range: [{pt_positions[..., 2].min():.4f}, {pt_positions[..., 2].max():.4f}], mean: {pt_positions[..., 2].mean():.4f}")
|
| 1329 |
+
LOGGER.info(f"CoreML positions - Z range: [{coreml_positions[..., 2].min():.4f}, {coreml_positions[..., 2].max():.4f}], mean: {coreml_positions[..., 2].mean():.4f}")
|
| 1330 |
+
|
| 1331 |
+
# Analyze position differences
|
| 1332 |
+
pos_diff = np.abs(pt_positions - coreml_positions)
|
| 1333 |
+
LOGGER.info(f"Position difference (X,Y,Z) - max: [{pos_diff[..., 0].max():.6f}, {pos_diff[..., 1].max():.6f}, {pos_diff[..., 2].max():.6f}]")
|
| 1334 |
+
LOGGER.info(f"Position difference (X,Y,Z) - mean: [{pos_diff[..., 0].mean():.6f}, {pos_diff[..., 1].mean():.6f}, {pos_diff[..., 2].mean():.6f}]")
|
| 1335 |
+
|
| 1336 |
+
# Check if error is proportional to depth (would indicate global_scale issue)
|
| 1337 |
+
z_diff = np.abs(pt_positions[..., 2] - coreml_positions[..., 2])
|
| 1338 |
+
z_ratio = z_diff / np.clip(pt_positions[..., 2], 1e-6, None)
|
| 1339 |
+
LOGGER.info(f"Z relative error - mean: {z_ratio.mean()*100:.4f}%, max: {z_ratio.max()*100:.4f}%")
|
| 1340 |
+
|
| 1341 |
+
# Log scales for comparison
|
| 1342 |
+
pt_scales = pt_outputs[1]
|
| 1343 |
+
coreml_scales_key = find_coreml_output_key("singular_values_scales", coreml_outputs)
|
| 1344 |
+
coreml_scales = coreml_outputs[coreml_scales_key]
|
| 1345 |
+
scales_diff = np.abs(pt_scales - coreml_scales)
|
| 1346 |
+
scales_ratio = scales_diff / np.clip(pt_scales, 1e-6, None)
|
| 1347 |
+
LOGGER.info(f"Scales relative error - mean: {scales_ratio.mean()*100:.4f}%, max: {scales_ratio.max()*100:.4f}%")
|
| 1348 |
|
| 1349 |
# Tolerances for real image validation
|
| 1350 |
tolerance_config = ToleranceConfig()
|
sharp.mlpackage/Data/com.apple.CoreML/model.mlmodel
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 938769
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6e2b156a2a72ad6f86da86b9100b13007b0d343bbd654fba8d65bee66553f2f1
|
| 3 |
size 938769
|
sharp.mlpackage/Manifest.json
CHANGED
|
@@ -1,18 +1,18 @@
|
|
| 1 |
{
|
| 2 |
"fileFormatVersion": "1.0.0",
|
| 3 |
"itemInfoEntries": {
|
| 4 |
-
"
|
| 5 |
-
"author": "com.apple.CoreML",
|
| 6 |
-
"description": "CoreML Model Specification",
|
| 7 |
-
"name": "model.mlmodel",
|
| 8 |
-
"path": "com.apple.CoreML/model.mlmodel"
|
| 9 |
-
},
|
| 10 |
-
"DD041C71-3C41-47F0-830E-A829C8EEC1EA": {
|
| 11 |
"author": "com.apple.CoreML",
|
| 12 |
"description": "CoreML Model Weights",
|
| 13 |
"name": "weights",
|
| 14 |
"path": "com.apple.CoreML/weights"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
}
|
| 16 |
},
|
| 17 |
-
"rootModelIdentifier": "
|
| 18 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"fileFormatVersion": "1.0.0",
|
| 3 |
"itemInfoEntries": {
|
| 4 |
+
"655381FB-8159-4BD7-A64E-7B14F30B787E": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
"author": "com.apple.CoreML",
|
| 6 |
"description": "CoreML Model Weights",
|
| 7 |
"name": "weights",
|
| 8 |
"path": "com.apple.CoreML/weights"
|
| 9 |
+
},
|
| 10 |
+
"A0921877-4847-4CCE-937D-414310330106": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Specification",
|
| 13 |
+
"name": "model.mlmodel",
|
| 14 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 15 |
}
|
| 16 |
},
|
| 17 |
+
"rootModelIdentifier": "A0921877-4847-4CCE-937D-414310330106"
|
| 18 |
}
|