Spaces:
Sleeping
Sleeping
Commit ·
1958836
1
Parent(s): 6a8cad3
update: export from starry-refactor 2026-02-20 15:25
Browse files
backend/python-services/predictors/unet.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
UNet model implementation.
|
| 3 |
+
Matches the architecture from deep-starry/starry/unet/ for loading .chkpt checkpoints.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
import torch.nn as nn
|
| 8 |
+
import torch.nn.functional as F
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class DoubleConv(nn.Module):
    """Two successive (Conv3x3 -> BatchNorm -> ReLU) stages.

    The first stage maps ``in_channels`` to ``mid_channels`` (defaulting to
    ``out_channels``); the second maps ``mid_channels`` to ``out_channels``.
    Spatial size is preserved (3x3 kernels with padding 1).
    """

    def __init__(self, in_channels, out_channels, mid_channels=None):
        super().__init__()
        hidden = mid_channels if mid_channels else out_channels
        stages = [
            nn.Conv2d(in_channels, hidden, kernel_size=3, padding=1),
            nn.BatchNorm2d(hidden),
            nn.ReLU(inplace=True),
            nn.Conv2d(hidden, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        ]
        # Attribute name 'double_conv' is part of the checkpoint key layout; keep it.
        self.double_conv = nn.Sequential(*stages)

    def forward(self, x):
        return self.double_conv(x)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class Down(nn.Module):
    """Encoder stage: halve the spatial size with 2x2 max-pooling, then DoubleConv."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        # Keep the Sequential under 'maxpool_conv' so checkpoint keys resolve.
        pool_then_conv = (nn.MaxPool2d(2), DoubleConv(in_channels, out_channels))
        self.maxpool_conv = nn.Sequential(*pool_then_conv)

    def forward(self, x):
        return self.maxpool_conv(x)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class Up(nn.Module):
    """Decoder stage: upsample, pad to the skip tensor's size, concatenate, DoubleConv.

    With ``bilinear`` the upsampling is parameter-free interpolation and the
    DoubleConv squeezes through ``in_channels // 2`` hidden channels; otherwise a
    transposed convolution halves the channel count before concatenation.
    """

    def __init__(self, in_channels, out_channels, bilinear=True):
        super().__init__()
        if bilinear:
            self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
            self.conv = DoubleConv(in_channels, out_channels, in_channels // 2)
        else:
            self.up = nn.ConvTranspose2d(in_channels, in_channels // 2, kernel_size=2, stride=2)
            self.conv = DoubleConv(in_channels, out_channels)

    def forward(self, x1, x2):
        x1 = self.up(x1)
        # Pad the upsampled tensor so its spatial dims match the skip connection
        # x2 (handles odd input sizes where pooling truncated a pixel).
        dh = x2.size(2) - x1.size(2)
        dw = x2.size(3) - x1.size(3)
        x1 = F.pad(x1, [dw // 2, dw - dw // 2, dh // 2, dh - dh // 2])
        merged = torch.cat([x2, x1], dim=1)
        return self.conv(merged)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class OutConv(nn.Module):
    """Final 1x1 convolution mapping feature channels to per-pixel class logits."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        # 1x1 kernel: channel mixing only, spatial size unchanged.
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1)

    def forward(self, x):
        return self.conv(x)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
class UNet(nn.Module):
    """U-Net with configurable depth and initial width.

    Mirrors the architecture from deep-starry/starry/unet/ so exported .chkpt
    state dicts load directly (attribute names inc/outc/downs/ups are part of
    the checkpoint key layout).
    """

    def __init__(self, n_channels, n_classes, classify_out=True, bilinear=True, depth=4, init_width=64):
        super().__init__()
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.classify_out = classify_out
        self.depth = depth
        # With bilinear upsampling the deepest encoder halves its output width so
        # the concatenated decoder input keeps the expected channel count.
        factor = 2 if bilinear else 1

        self.inc = DoubleConv(n_channels, init_width)
        self.outc = OutConv(init_width, n_classes)

        down_stages = []
        for level in range(depth):
            width_in = init_width << level
            width_out = width_in * 2
            if level == depth - 1:
                width_out //= factor
            down_stages.append(Down(width_in, width_out))

        up_stages = []
        for level in range(depth):
            width_in = init_width << (depth - level)
            width_out = width_in // 2
            if level < depth - 1:
                width_out //= factor
            up_stages.append(Up(width_in, width_out, bilinear))

        self.downs = nn.ModuleList(down_stages)
        self.ups = nn.ModuleList(up_stages)

    def forward(self, input):
        skips = []
        x = self.inc(input)
        for stage in self.downs:
            skips.append(x)
            x = stage(x)
        # Decode, consuming skip connections deepest-first.
        for stage, skip in zip(self.ups, reversed(skips)):
            x = stage(x, skip)
        if not self.classify_out:
            return x
        return self.outc(x)
|
backend/python-services/services/gauge_service.py
CHANGED
|
@@ -1,13 +1,19 @@
|
|
| 1 |
"""
|
| 2 |
Gauge prediction service.
|
| 3 |
Predicts staff gauge (height and slope) map.
|
|
|
|
| 4 |
"""
|
| 5 |
|
|
|
|
|
|
|
|
|
|
| 6 |
import numpy as np
|
| 7 |
import torch
|
|
|
|
| 8 |
import PIL.Image
|
| 9 |
|
| 10 |
-
from predictors.torchscript_predictor import
|
|
|
|
| 11 |
from common.image_utils import (
|
| 12 |
array_from_image_stream, slice_feature, splice_output_tensor,
|
| 13 |
gauge_to_rgb, encode_image_base64, encode_image_bytes,
|
|
@@ -16,6 +22,80 @@ from common.image_utils import (
|
|
| 16 |
from common.transform import Composer
|
| 17 |
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
class StaffGauge:
|
| 20 |
"""Staff gauge representation."""
|
| 21 |
|
|
@@ -32,14 +112,15 @@ class StaffGauge:
|
|
| 32 |
}
|
| 33 |
|
| 34 |
|
| 35 |
-
class GaugeService
|
| 36 |
-
"""Gauge prediction service
|
| 37 |
|
| 38 |
DEFAULT_TRANS = ['Mono', 'HWC2CHW']
|
| 39 |
DEFAULT_SLICING_WIDTH = 512
|
| 40 |
|
| 41 |
def __init__(self, model_path, device='cuda', trans=None, slicing_width=None):
|
| 42 |
-
|
|
|
|
| 43 |
self.composer = Composer(trans or self.DEFAULT_TRANS)
|
| 44 |
self.slicing_width = slicing_width or self.DEFAULT_SLICING_WIDTH
|
| 45 |
|
|
@@ -70,7 +151,8 @@ class GaugeService(TorchScriptPredictor):
|
|
| 70 |
batch = torch.from_numpy(staves).to(self.device)
|
| 71 |
|
| 72 |
# Inference
|
| 73 |
-
|
|
|
|
| 74 |
|
| 75 |
# Splice output
|
| 76 |
hotmap = splice_output_tensor(output, soft=True) # (channel, height, width)
|
|
|
|
| 1 |
"""
|
| 2 |
Gauge prediction service.
|
| 3 |
Predicts staff gauge (height and slope) map.
|
| 4 |
+
Supports both TorchScript (.pt) and state_dict (.chkpt) model formats.
|
| 5 |
"""
|
| 6 |
|
| 7 |
+
import os
|
| 8 |
+
import logging
|
| 9 |
+
from collections import OrderedDict
|
| 10 |
import numpy as np
|
| 11 |
import torch
|
| 12 |
+
import yaml
|
| 13 |
import PIL.Image
|
| 14 |
|
| 15 |
+
from predictors.torchscript_predictor import resolve_model_path
|
| 16 |
+
from predictors.unet import UNet
|
| 17 |
from common.image_utils import (
|
| 18 |
array_from_image_stream, slice_feature, splice_output_tensor,
|
| 19 |
gauge_to_rgb, encode_image_base64, encode_image_bytes,
|
|
|
|
| 22 |
from common.transform import Composer
|
| 23 |
|
| 24 |
|
| 25 |
+
class _ScoreRegression(torch.nn.Module):
    """Minimal ScoreRegression shell for loading .chkpt checkpoints.

    Holds the UNet under the attribute name ``backbone`` so keys of a
    ``backbone.*`` state_dict resolve without remapping.
    """

    def __init__(self, in_channels=1, out_channels=2, unet_depth=6, unet_init_width=32):
        super().__init__()
        net = UNet(in_channels, out_channels, depth=unet_depth, init_width=unet_init_width)
        self.backbone = net

    def forward(self, input):
        return self.backbone(input)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def _load_gauge_model(model_path, device):
    """Load gauge model, handling both TorchScript and state_dict formats.

    Tries ``torch.jit.load`` first; on failure falls back to building a
    ``_ScoreRegression`` (configured from a sibling ``.state.yaml`` when
    present) and loading a plain state_dict checkpoint into it.

    Args:
        model_path: Path spec understood by ``resolve_model_path``.
        device: Torch device (string or device) used as ``map_location`` and
            as the final placement of the model.

    Returns:
        The loaded model in eval mode (ScriptModule or ``_ScoreRegression``).
    """
    resolved = resolve_model_path(model_path)

    # Try TorchScript first; jit.load raising is used as format detection,
    # hence the deliberately broad except below.
    try:
        model = torch.jit.load(resolved, map_location=device)
        model.eval()
        logging.info('GaugeService: TorchScript model loaded: %s', resolved)
        return model
    except Exception as e:
        logging.info('GaugeService: not TorchScript (%s), trying state_dict...', str(e)[:60])

    # Read model config from .state.yaml next to the checkpoint; fall back to
    # the defaults below when the file is absent.
    model_dir = os.path.dirname(resolved)
    state_file = os.path.join(model_dir, '.state.yaml')
    unet_depth = 6
    unet_init_width = 32
    out_channels = 2
    if os.path.exists(state_file):
        with open(state_file, 'r') as f:
            state = yaml.safe_load(f)
        model_args = state.get('model', {}).get('args', {})
        backbone = model_args.get('backbone', {})
        unet_depth = backbone.get('unet_depth', 6)
        unet_init_width = backbone.get('unet_init_width', 32)
        out_channels = model_args.get('out_channels', 2)

    model = _ScoreRegression(out_channels=out_channels, unet_depth=unet_depth, unet_init_width=unet_init_width)
    # NOTE(review): weights_only=False unpickles arbitrary objects — only load
    # checkpoints from trusted sources.
    checkpoint = torch.load(resolved, map_location=device, weights_only=False)

    # Handle different checkpoint formats: either a bare state_dict or a
    # training checkpoint wrapping it under 'model'.
    state_dict = checkpoint
    if isinstance(checkpoint, dict):
        if 'model' in checkpoint:
            state_dict = checkpoint['model']

    # Strip common prefixes from training wrapper (ScoreRegressionLoss.deducer.*)
    if isinstance(state_dict, dict):
        cleaned = OrderedDict()
        for key, value in state_dict.items():
            new_key = key
            if new_key.startswith('deducer.'):
                new_key = new_key[len('deducer.'):]
            cleaned[new_key] = value
        # Remove non-model keys (e.g. channel_weights from Loss wrapper)
        cleaned = OrderedDict((k, v) for k, v in cleaned.items()
                              if k.startswith('backbone.'))
        state_dict = cleaned

    # strict=False tolerates missing/unexpected keys; mismatches surface only
    # in the matched-key count logged below.
    model.load_state_dict(state_dict, strict=False)
    model.eval()
    model.to(device)

    # Log key loading stats so silent partial loads are visible in the logs.
    model_keys = set(model.state_dict().keys())
    loaded_keys = set(state_dict.keys())
    matched = model_keys & loaded_keys
    logging.info('GaugeService: state_dict loaded: %s (%d/%d keys matched, depth=%d, width=%d)',
                 resolved, len(matched), len(model_keys), unet_depth, unet_init_width)
    return model
|
| 97 |
+
|
| 98 |
+
|
| 99 |
class StaffGauge:
|
| 100 |
"""Staff gauge representation."""
|
| 101 |
|
|
|
|
| 112 |
}
|
| 113 |
|
| 114 |
|
| 115 |
+
class GaugeService:
|
| 116 |
+
"""Gauge prediction service. Supports TorchScript and state_dict formats."""
|
| 117 |
|
| 118 |
DEFAULT_TRANS = ['Mono', 'HWC2CHW']
|
| 119 |
DEFAULT_SLICING_WIDTH = 512
|
| 120 |
|
| 121 |
def __init__(self, model_path, device='cuda', trans=None, slicing_width=None):
|
| 122 |
+
self.device = device
|
| 123 |
+
self.model = _load_gauge_model(model_path, device)
|
| 124 |
self.composer = Composer(trans or self.DEFAULT_TRANS)
|
| 125 |
self.slicing_width = slicing_width or self.DEFAULT_SLICING_WIDTH
|
| 126 |
|
|
|
|
| 151 |
batch = torch.from_numpy(staves).to(self.device)
|
| 152 |
|
| 153 |
# Inference
|
| 154 |
+
with torch.no_grad():
|
| 155 |
+
output = self.model(batch) # (batch, channel, height, width)
|
| 156 |
|
| 157 |
# Splice output
|
| 158 |
hotmap = splice_output_tensor(output, soft=True) # (channel, height, width)
|
backend/python-services/services/mask_service.py
CHANGED
|
@@ -1,13 +1,18 @@
|
|
| 1 |
"""
|
| 2 |
Mask prediction service.
|
| 3 |
Generates staff foreground/background mask.
|
|
|
|
| 4 |
"""
|
| 5 |
|
|
|
|
|
|
|
| 6 |
import numpy as np
|
| 7 |
import torch
|
|
|
|
| 8 |
import PIL.Image
|
| 9 |
|
| 10 |
-
from predictors.torchscript_predictor import
|
|
|
|
| 11 |
from common.image_utils import (
|
| 12 |
array_from_image_stream, slice_feature, splice_output_tensor,
|
| 13 |
mask_to_alpha, encode_image_base64, encode_image_bytes,
|
|
@@ -16,6 +21,65 @@ from common.image_utils import (
|
|
| 16 |
from common.transform import Composer
|
| 17 |
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
class StaffMask:
|
| 20 |
"""Staff mask representation."""
|
| 21 |
|
|
@@ -32,14 +96,15 @@ class StaffMask:
|
|
| 32 |
}
|
| 33 |
|
| 34 |
|
| 35 |
-
class MaskService
|
| 36 |
-
"""Mask prediction service
|
| 37 |
|
| 38 |
DEFAULT_TRANS = ['Mono', 'HWC2CHW']
|
| 39 |
DEFAULT_SLICING_WIDTH = 512
|
| 40 |
|
| 41 |
def __init__(self, model_path, device='cuda', trans=None, slicing_width=None):
|
| 42 |
-
|
|
|
|
| 43 |
self.composer = Composer(trans or self.DEFAULT_TRANS)
|
| 44 |
self.slicing_width = slicing_width or self.DEFAULT_SLICING_WIDTH
|
| 45 |
|
|
@@ -70,7 +135,8 @@ class MaskService(TorchScriptPredictor):
|
|
| 70 |
batch = torch.from_numpy(staves).to(self.device)
|
| 71 |
|
| 72 |
# Inference
|
| 73 |
-
|
|
|
|
| 74 |
|
| 75 |
# Splice output
|
| 76 |
hotmap = splice_output_tensor(output, soft=True) # (channel, height, width)
|
|
|
|
| 1 |
"""
|
| 2 |
Mask prediction service.
|
| 3 |
Generates staff foreground/background mask.
|
| 4 |
+
Supports both TorchScript (.pt) and state_dict (.chkpt) model formats.
|
| 5 |
"""
|
| 6 |
|
| 7 |
+
import os
|
| 8 |
+
import logging
|
| 9 |
import numpy as np
|
| 10 |
import torch
|
| 11 |
+
import yaml
|
| 12 |
import PIL.Image
|
| 13 |
|
| 14 |
+
from predictors.torchscript_predictor import resolve_model_path
|
| 15 |
+
from predictors.unet import UNet
|
| 16 |
from common.image_utils import (
|
| 17 |
array_from_image_stream, slice_feature, splice_output_tensor,
|
| 18 |
mask_to_alpha, encode_image_base64, encode_image_bytes,
|
|
|
|
| 21 |
from common.transform import Composer
|
| 22 |
|
| 23 |
|
| 24 |
+
class _ScoreWidgetsMask(torch.nn.Module):
    """Minimal ScoreWidgetsMask shell for loading .chkpt checkpoints.

    Holds the UNet under the attribute name ``mask`` so checkpoint sub-dicts
    saved as ``{'mask': ...}`` load directly into it.
    """

    def __init__(self, in_channels=1, mask_channels=2, unet_depth=5, unet_init_width=32):
        super().__init__()
        net = UNet(in_channels, mask_channels, depth=unet_depth, init_width=unet_init_width)
        self.mask = net

    def forward(self, x):
        logits = self.mask(x)
        return torch.sigmoid(logits)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _load_mask_model(model_path, device):
    """Load mask model, handling both TorchScript and state_dict formats.

    Tries ``torch.jit.load`` first; on failure builds a ``_ScoreWidgetsMask``
    (configured from a sibling ``.state.yaml`` when present) and loads a plain
    state_dict checkpoint into it.

    Args:
        model_path: Path spec understood by ``resolve_model_path``.
        device: Torch device (string or device) used as ``map_location`` and
            as the final placement of the model.

    Returns:
        The loaded model in eval mode (ScriptModule or ``_ScoreWidgetsMask``).
    """
    resolved = resolve_model_path(model_path)

    # Try TorchScript first; jit.load raising is used as format detection,
    # hence the deliberately broad except below.
    try:
        model = torch.jit.load(resolved, map_location=device)
        model.eval()
        logging.info('MaskService: TorchScript model loaded: %s', resolved)
        return model
    except Exception as e:
        logging.info('MaskService: not TorchScript (%s), trying state_dict...', str(e)[:60])

    # Read model config from .state.yaml next to the checkpoint; fall back to
    # the defaults below when the file is absent.
    model_dir = os.path.dirname(resolved)
    state_file = os.path.join(model_dir, '.state.yaml')
    unet_depth = 5
    unet_init_width = 32
    if os.path.exists(state_file):
        with open(state_file, 'r') as f:
            state = yaml.safe_load(f)
        mask_config = state.get('model', {}).get('args', {}).get('mask', {})
        unet_depth = mask_config.get('unet_depth', 5)
        unet_init_width = mask_config.get('unet_init_width', 32)

    model = _ScoreWidgetsMask(unet_depth=unet_depth, unet_init_width=unet_init_width)
    # NOTE(review): weights_only=False unpickles arbitrary objects — only load
    # checkpoints from trusted sources.
    checkpoint = torch.load(resolved, map_location=device, weights_only=False)

    # Handle different checkpoint formats: either a bare state_dict or a
    # training checkpoint wrapping it under 'model'.
    state_dict = checkpoint
    if isinstance(checkpoint, dict):
        if 'model' in checkpoint:
            state_dict = checkpoint['model']

    # ScoreWidgetsMask saves as {'mask': {UNet weights}}
    if isinstance(state_dict, dict) and 'mask' in state_dict:
        model.mask.load_state_dict(state_dict['mask'])
    else:
        # Try loading directly (may have 'mask.' prefix from nn.Module default);
        # strict=False tolerates missing/unexpected keys.
        model.load_state_dict(state_dict, strict=False)

    model.eval()
    model.to(device)
    logging.info('MaskService: state_dict loaded: %s (depth=%d, width=%d)',
                 resolved, unet_depth, unet_init_width)
    return model
|
| 81 |
+
|
| 82 |
+
|
| 83 |
class StaffMask:
|
| 84 |
"""Staff mask representation."""
|
| 85 |
|
|
|
|
| 96 |
}
|
| 97 |
|
| 98 |
|
| 99 |
+
class MaskService:
|
| 100 |
+
"""Mask prediction service. Supports TorchScript and state_dict formats."""
|
| 101 |
|
| 102 |
DEFAULT_TRANS = ['Mono', 'HWC2CHW']
|
| 103 |
DEFAULT_SLICING_WIDTH = 512
|
| 104 |
|
| 105 |
def __init__(self, model_path, device='cuda', trans=None, slicing_width=None):
|
| 106 |
+
self.device = device
|
| 107 |
+
self.model = _load_mask_model(model_path, device)
|
| 108 |
self.composer = Composer(trans or self.DEFAULT_TRANS)
|
| 109 |
self.slicing_width = slicing_width or self.DEFAULT_SLICING_WIDTH
|
| 110 |
|
|
|
|
| 135 |
batch = torch.from_numpy(staves).to(self.device)
|
| 136 |
|
| 137 |
# Inference
|
| 138 |
+
with torch.no_grad():
|
| 139 |
+
output = self.model(batch) # (batch, channel, height, width)
|
| 140 |
|
| 141 |
# Splice output
|
| 142 |
hotmap = splice_output_tensor(output, soft=True) # (channel, height, width)
|