# mosaic-light / foundation_bert.py
import torch
import os
import yaml
from pathlib import Path
# from ..utils.masked_data_modeling_loss import MaskedDataLossWithSoftmax
# from ..utils.contrastive_loss import ContrastiveLoss
# from ..utils.yaml_util import MyLoader
from dataclasses import dataclass
from transformers import ModernBertModel, ModernBertConfig, PretrainedConfig
from transformers.utils import cached_file
from typing import Optional, Union
class MyLoader(yaml.SafeLoader):
    # SafeLoader variant that tolerates unknown YAML tags instead of raising.
    def construct_mapping(self, *args, **kwargs):
        # When loading we want to skip keys that require construction, so register
        # a fallback constructor that wraps them instead of raising ConstructorError.
        super().add_constructor(None, construct_undefined)
        mapping = super().construct_mapping(*args, **kwargs)
        return mapping


import typing


class Tagged(typing.NamedTuple):
    tag: str
    value: object


def construct_undefined(self, node):
    # Fallback constructor: build a plain Python value and keep the original tag.
    if isinstance(node, yaml.nodes.ScalarNode):
        value = self.construct_scalar(node)
    elif isinstance(node, yaml.nodes.SequenceNode):
        value = self.construct_sequence(node)
    elif isinstance(node, yaml.nodes.MappingNode):
        value = self.construct_mapping(node)
    else:
        assert False, f"unexpected node: {node!r}"
    return Tagged(node.tag, value)
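# Usage sketch (hypothetical config path): entries whose tags SafeLoader does not
# recognise (e.g. "!!python/object:..." objects written by the training script)
# come back wrapped as Tagged(tag, value) instead of raising:
#   with open("train_config.yaml") as f:
#       cfg = yaml.load(f, Loader=MyLoader)
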
@dataclass
class FoundationOutput:
    loss: Optional[torch.Tensor] = None
    logits: Optional[torch.Tensor] = None
    num_output: Optional[torch.Tensor] = None
    est_err_output: Optional[torch.Tensor] = None
    hidden_states: Optional[torch.Tensor] = None
    masked_loss: Optional[torch.Tensor] = None
    num_loss: Optional[torch.Tensor] = None
    est_err_loss: Optional[torch.Tensor] = None

@dataclass
class FoundationBertConfig:
    vocab_size: int
    hidden_size: int
    num_hidden_layers: int
    num_attention_heads: int
    intermediate_size: int
    hidden_dropout_prob: float
    attention_probs_dropout_prob: float
    pad_token_id: int
    classifier_dropout: float
    max_position_embeddings: int
    contrastive_temperature: float
    loss_weights: dict
    use_xval_loss: bool = True
    use_mlm_loss: bool = True
    use_regression_loss: bool = False
    use_contrastive_loss: bool = False
    transform_numeric: bool = False
    use_sdpa_attention: bool = True

    def to_dict(self):
        return {k: getattr(self, k) for k in self.__dataclass_fields__.keys()}

class FoundationBert(ModernBertModel):
    def __init__(self,
                 config: FoundationBertConfig = None,
                 use_mlm_loss: bool = False,
                 use_regression_loss: bool = True,
                 use_contrastive_loss: bool = False,
                 use_xval_loss: bool = False,
                 transform_numeric: bool = False,
                 *args,
                 **kwargs):
        self.gconfig = config
        # print(f"⚠️ FoundationBert.__init__: {self.gconfig=}")
        bert_conf = ModernBertConfig(
            vocab_size=config.vocab_size,
            hidden_size=config.hidden_size,
            num_hidden_layers=config.num_hidden_layers,
            num_attention_heads=config.num_attention_heads,
            intermediate_size=config.intermediate_size,
            hidden_dropout_prob=config.hidden_dropout_prob,
            attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            pad_token_id=config.pad_token_id,
            max_position_embeddings=config.max_position_embeddings,
            _attn_implementation='sdpa'
        )
        self.gconfig.transform_numeric = transform_numeric
        super().__init__(bert_conf)
        try:
            if not self.gconfig.use_mlm_loss and not self.gconfig.use_regression_loss and not self.gconfig.use_contrastive_loss:
                raise ValueError("At least one loss must be enabled")
            self.loss_mod = float(self.gconfig.use_mlm_loss) + float(self.gconfig.use_regression_loss) + float(self.gconfig.use_contrastive_loss) + float(self.gconfig.use_xval_loss)
        except (AttributeError, ValueError):
            # Fall back to the constructor arguments when the config lacks the
            # loss flags (or enables none of them).
            self.gconfig.use_mlm_loss = use_mlm_loss
            self.gconfig.use_regression_loss = use_regression_loss
            self.gconfig.use_contrastive_loss = use_contrastive_loss
            self.gconfig.use_xval_loss = use_xval_loss
            self.loss_mod = float(self.gconfig.use_mlm_loss) + float(self.gconfig.use_regression_loss) + float(self.gconfig.use_contrastive_loss) + float(self.gconfig.use_xval_loss)
        self.dataset_path = kwargs.get('dataset_path', None)
        self.vector_shape = kwargs['vector_shape']
        self.scalar_shape = kwargs['scalar_shape']
        self.mask_token = kwargs['mask_token']
        # self.scalar_keys = [
        #     'redshift',
        #     'halo_mass',
        #     'stellar_mass',
        # ]
        # self.vector_keys = [
        #     'SED',
        #     'SFH',
        #     'mag_{band}_spherex',
        #     'mag_{band}_lsst',
        # ]
        # Map each modality to its own name if it is a vector, otherwise to the shared 'scalars' group.
        self.modalscalars = [m if m in self.vector_shape else 'scalars' for m in self.modalities]
        # Remove duplicates while preserving order.
        self.modalscalars = list(dict.fromkeys(self.modalscalars))
        print(f"✅ FoundationBert.__init__ is called with {kwargs=}, {self.modalscalars=}, {self.dataset_path=} ✅")
        self.embedding = torch.nn.ModuleDict()  # modality-specific embedding layers
        self.num_head = torch.nn.ModuleDict()   # modality-specific regression heads
        # Create modality-specific layers.
        for modality in self.modalscalars:
            self.embedding[modality] = torch.nn.Linear(1, config.hidden_size)  # input.shape -> output.shape: (B, L, 1) -> (B, L, H)
            self.num_head[modality] = torch.nn.Sequential(
                torch.nn.Linear(config.hidden_size, config.hidden_size),
                torch.nn.LayerNorm(config.hidden_size),
                torch.nn.GELU(),
                torch.nn.Linear(config.hidden_size, config.hidden_size // 2),
                torch.nn.GELU(),
                torch.nn.Linear(config.hidden_size // 2, 1)
            )
        # self.position_embeddings = torch.nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.embed_dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        # self.lm_head = torch.nn.Linear(config.hidden_size, config.vocab_size, bias=False)  # isn't used currently
        # self.xval_loss = torch.nn.MSELoss(reduction='none')  # isn't used currently
        # self.mlm_loss = MaskedDataLossWithSoftmax(ignore=-100, reduction='none')  # isn't used currently
        self.distributed_loss = False

    @property
    def modalities(self):
        return self.vector_shape | self.scalar_shape
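    # Example (hypothetical shapes): vector_shape={'SED': 16} and scalar_shape={'redshift': 1}
    # give modalities={'SED': 16, 'redshift': 1} and modalscalars=['SED', 'scalars'].
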
    @classmethod
    def from_pretrained(cls,
                        pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
                        *model_args,
                        config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None,
                        cache_dir: Optional[Union[str, os.PathLike]] = None,
                        ignore_mismatched_sizes: bool = False,
                        force_download: bool = False,
                        local_files_only: bool = False,
                        token: Optional[Union[str, bool]] = None,
                        revision: str = "main",
                        use_safetensors: Optional[bool] = None,
                        **kwargs,
                        ):
        """
        Modified `from_pretrained` that also loads the extra constructor parameters
        (modalities, dataset path, mask token, and shape dictionaries) from the
        accompanying train_config.yaml.
        """
        path = Path(pretrained_model_name_or_path)
        if 'checkpoint' in str(pretrained_model_name_or_path):
            model_config = path.parent / 'train_config.yaml'
        elif path.is_dir():
            model_config = path / 'train_config.yaml'
        else:
            # Remote repo: fetch train_config.yaml through the Hub cache.
            model_config = cached_file(
                pretrained_model_name_or_path,
                'train_config.yaml',
                cache_dir=cache_dir,
                force_download=force_download,
                local_files_only=local_files_only,
                token=token,
                revision=revision,
            )
        with open(model_config, 'r') as f:
            config = yaml.load(f, Loader=MyLoader)
        kwargs['modalities'] = config['modalities']
        kwargs['dataset_path'] = config['dataset_path']
        kwargs['mask_token'] = config['mask_token']
        if 'vector_shape' not in kwargs and 'vector_shape' in config:
            kwargs['vector_shape'] = config['vector_shape']
        if 'scalar_shape' not in kwargs and 'scalar_shape' in config:
            kwargs['scalar_shape'] = config['scalar_shape']
        print(f"✅ FoundationBert.from_pretrained is called with {model_config=} and {kwargs=} ✅")
        return super().from_pretrained(
            pretrained_model_name_or_path,
            **config['model_config'],
            **kwargs
        )
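    # Usage sketch (hypothetical identifiers):
    #   model = FoundationBert.from_pretrained("some-org/mosaic-light")      # Hub repo: train_config.yaml fetched via cached_file
    #   model = FoundationBert.from_pretrained("runs/exp1/checkpoint-1000")  # local checkpoint: train_config.yaml read from the parent directory
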
    def pool_output(self,
                    embeddings: torch.Tensor,
                    attention_mask: torch.Tensor,
                    use_last: bool = False
                    ) -> torch.Tensor:
        """Average pool the hidden states using the attention mask.

        Parameters
        ----------
        embeddings : torch.Tensor
            The hidden states to pool (B, SeqLen, HiddenDim).
        attention_mask : torch.Tensor
            The attention mask for the hidden states (B, SeqLen).
        use_last : bool
            Controls which trailing position is excluded together with the start
            token (index seq_len - 1 when True, seq_len - 2 when False).

        Returns
        -------
        torch.Tensor
            The pooled embeddings (B, HiddenDim).
        """
        # Get the sequence lengths
        sl_mod = 1 if use_last else 2
        seq_lengths = attention_mask.sum(dim=1)
        # Set the attention mask to 0 for the start and end token of each sequence
        new_attention = attention_mask.clone()
        new_attention[:, 0] = 0
        batch_idx = torch.arange(attention_mask.shape[0], device=attention_mask.device)
        new_attention[batch_idx, seq_lengths - sl_mod] = 0
        # Create a mask for the pooling operation (B, SeqLen, HiddenDim)
        pool_mask = new_attention.unsqueeze(-1).expand(embeddings.shape).to(embeddings.device)
        # Sum the embeddings over the sequence length (use the mask to avoid
        # pad, start, and stop tokens)
        sum_embeds = torch.sum(embeddings * pool_mask, 1)
        # Avoid division by zero for zero length sequences by clamping
        # sum_mask = torch.clamp(pool_mask.sum(1), min=1e-9)
        seq_lengths = torch.clamp(seq_lengths, min=1).unsqueeze(-1)  # Shape (B, 1) to broadcast
        # Compute mean pooled embeddings for each sequence
        return sum_embeds / seq_lengths
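    # Worked example (sketch): an attention_mask row [1, 1, 1, 1, 0] with
    # use_last=False zeroes positions 0 and seq_len - 2 = 2 in the numerator,
    # then divides the summed embeddings by the attended length (4).
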
    def last_token_pool(
        self,
        embeddings: torch.Tensor,
        attention_mask: torch.Tensor,
    ) -> torch.Tensor:
        """Pool the last hidden states using the attention mask.

        Parameters
        ----------
        embeddings : torch.Tensor
            The last hidden states to pool (B, SeqLen, HiddenDim).
        attention_mask : torch.Tensor
            The attention mask for the hidden states (B, SeqLen).

        Returns
        -------
        torch.Tensor
            The pooled embeddings (B, HiddenDim).
        """
        left_padding = attention_mask[:, -1].sum() == attention_mask.shape[0]
        if left_padding:
            return embeddings[:, -1]
        else:
            sequence_lengths = attention_mask.sum(dim=1) - 1
            batch_size = embeddings.shape[0]
            return embeddings[
                torch.arange(batch_size, device=embeddings.device),
                sequence_lengths,
            ]
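    # Example (sketch): with right padding, an attention_mask row [1, 1, 1, 0]
    # selects the embedding at index 2; when every row ends in an attended token
    # (left padding), the final position is returned for all rows.
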
    def forward(self, inputs, return_input_label_mapping=False):
        """
        Forward pass that computes predictions for each modality.

        Args:
            inputs (dict): A dictionary containing `input_<modality>` and `labels_<modality>` entries.
            return_input_label_mapping (bool): If True, also return the per-modality input/label mapping.

        Returns:
            outputs (dict): A dictionary containing the regression logits for each modality.
        """
        # Initialize the dictionary for the dynamic input-label mapping
        input_label_mapping = {}
        combined = []
        for src_modality in self.modalscalars:
            # Add the modality's input and label data to the input_label_mapping
            input_label_mapping[src_modality] = {
                'input': inputs[f"input_{src_modality}"],   # Input data
                'labels': inputs[f"labels_{src_modality}"]  # Corresponding labels
            }
            input_data = input_label_mapping[src_modality]['input']  # get input data
            label = input_label_mapping[src_modality]['labels']      # get label data (for masking)
            input_data = torch.where(label, self.mask_token, input_data)  # apply masking
            x = self.embedding[src_modality](input_data.unsqueeze(-1))  # shape: (B, L, H)
            x = torch.nn.functional.silu(x)
            combined.append(x)  # combine all modalities
        combined = torch.cat(combined, dim=1)  # Concatenate along the sequence length dimension
        position_ids = torch.arange(combined.size(1)).unsqueeze(0).to(combined.device)  # shape: (1, L)
        # combined += self.position_embeddings(position_ids)  # add position embedding
        combined = self.embed_dropout(combined)
        # x = self.encoder(combined, output_hidden_states=True).last_hidden_state  # encode the combined input
        hidden_states = combined
        for encoder_layer in self.layers:
            hidden_states = encoder_layer(hidden_states, position_ids=position_ids)[0]
        x = self.final_norm(hidden_states)
        start = 0
        outputs = {}
        # Iterate over each target modality to compute logits
        for tgt_modality in self.modalscalars:
            length = input_label_mapping[tgt_modality]['input'].shape[1]  # get sequence length of the modality
            x_t = x[:, start:start + length, :]  # slice the encoded output for each modality
            outputs[f"{tgt_modality}_logits"] = self.num_head[tgt_modality](x_t)  # modality-specific regression head
            start += length  # update start index for next modality
        if getattr(self, 'save_umap_for', None):
            pooled = x_t.mean(dim=1)  # Mean pooling over the sequence length dimension
            self.save_pooled_embedding(pooled)  # saved for UMAP visualization
        return (outputs, input_label_mapping) if return_input_label_mapping else outputs
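    # Expected batch layout (sketch; names follow the input_/labels_ convention above):
    #   inputs = {
    #       "input_SED": (B, L_SED) floats,        "labels_SED": (B, L_SED) bool mask,
    #       "input_scalars": (B, N_scalars) floats, "labels_scalars": (B, N_scalars) bool mask,
    #   }
    # Positions where the label mask is True are replaced by self.mask_token before embedding.
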
    def save_pooled_embedding(self, features):
        """
        Append the pooled features to the HDF5 file given by `self.save_umap_for`.
        """
        import h5py
        fname = Path(self.save_umap_for)
        fname.parent.mkdir(parents=True, exist_ok=True)
        features = features.detach().cpu().numpy()
        if fname.exists():
            with h5py.File(fname, 'r+') as f:
                old_size = f['features'].shape[0]  # get current size
                new_size = old_size + features.shape[0]  # calculate new size
                f['features'].resize((new_size, features.shape[-1]))  # resize dataset
                f['features'][old_size:] = features  # append new features
        else:
            with h5py.File(fname, 'w') as f:
                f.create_dataset('features', data=features, maxshape=(None, features.shape[-1]), chunks=True)
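    # Read-back sketch (hypothetical file name) for UMAP or other offline analysis:
    #   import h5py
    #   with h5py.File("umap_features.h5", "r") as f:
    #       feats = f["features"][:]
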
    def get_retrieval_embedding(
        self,
        inputs,
        pooling: str = "mean",
        normalize: bool = True,
    ) -> torch.Tensor:
        """
        Build a single embedding per sample for kNN-style retrieval.

        Parameters
        ----------
        inputs : dict
            Batch dict with `input_<modality>` and `labels_<modality>` entries.
        pooling : str
            `mean` (default) or `last`.
        normalize : bool
            L2-normalize output embeddings for cosine/inner-product search.
        """
        combined = []
        for src_modality in self.modalscalars:
            input_data = inputs[f"input_{src_modality}"]
            label = inputs[f"labels_{src_modality}"]
            input_data = torch.where(label, self.mask_token, input_data)
            x = self.embedding[src_modality](input_data.unsqueeze(-1))
            x = torch.nn.functional.silu(x)
            combined.append(x)
        combined = torch.cat(combined, dim=1)
        position_ids = torch.arange(combined.size(1)).unsqueeze(0).to(combined.device)
        combined = self.embed_dropout(combined)
        hidden_states = combined
        for encoder_layer in self.layers:
            hidden_states = encoder_layer(hidden_states, position_ids=position_ids)[0]
        hidden_states = self.final_norm(hidden_states)
        if pooling == "last":
            embedding = hidden_states[:, -1, :]
        else:
            embedding = hidden_states.mean(dim=1)
        if normalize:
            embedding = torch.nn.functional.normalize(embedding, p=2, dim=-1)
        return embedding
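

if __name__ == "__main__":
    # Minimal smoke-test sketch (not part of the training pipeline). The modality
    # names, shapes, and mask token below are illustrative assumptions; it only
    # exercises __init__, forward, and get_retrieval_embedding with random data,
    # assuming the same transformers/ModernBert version this module targets.
    cfg = FoundationBertConfig(
        vocab_size=32, hidden_size=64, num_hidden_layers=2, num_attention_heads=2,
        intermediate_size=128, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1,
        pad_token_id=0, classifier_dropout=0.1, max_position_embeddings=128,
        contrastive_temperature=0.1, loss_weights={},
    )
    model = FoundationBert(
        cfg,
        vector_shape={"SED": 16},      # hypothetical vector modality
        scalar_shape={"redshift": 1},  # hypothetical scalar modality
        mask_token=-1.0,
    )
    batch = {
        "input_SED": torch.randn(2, 16),
        "labels_SED": torch.zeros(2, 16, dtype=torch.bool),
        "input_scalars": torch.randn(2, 1),
        "labels_scalars": torch.zeros(2, 1, dtype=torch.bool),
    }
    with torch.no_grad():
        out = model(batch)
        emb = model.get_retrieval_embedding(batch)
    print({k: tuple(v.shape) for k, v in out.items()})  # per-modality logits
    print(emb.shape)  # one L2-normalized embedding per sample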