Spaces:
Sleeping
Sleeping
| """ | |
| A function to prepare the dataloaders | |
| """ | |
| # Import packages | |
| import glob | |
| import torch | |
| import pandas as pd | |
| import numpy as np | |
| from torch.utils.data import DataLoader, TensorDataset | |
| from llmprop_utils import * | |
| np.random.seed(42) | |
def tokenize(tokenizer, dataframe, max_length, pooling='cls'):
    """Tokenize every entry of ``dataframe.description``.

    Each description is converted with ``str()``. When ``pooling`` is
    ``'cls'`` the literal prefix ``"[CLS] "`` is prepended to the text before
    it is handed to ``tokenizer``; for any other ``pooling`` value the text is
    passed through unchanged.

    Args:
        tokenizer: Callable invoked once per description with the text and
            the keyword arguments ``add_special_tokens``, ``padding``,
            ``truncation``, ``max_length`` and ``return_attention_mask``.
            Its result must support ``['input_ids']`` and
            ``['attention_mask']`` lookups.
        dataframe: Object exposing a ``description`` attribute that has a
            ``tolist()`` method (e.g. a pandas DataFrame column).
        max_length: Forwarded to ``tokenizer`` as ``max_length``.
        pooling: ``'cls'`` enables the ``"[CLS] "`` prefix.

    Returns:
        A ``(input_ids, attention_masks)`` tuple of two lists with one entry
        per description, in the original row order.
    """
    # An empty prefix leaves the text untouched for non-'cls' pooling.
    prefix = "[CLS] " if pooling == 'cls' else ""
    texts = [prefix + str(raw) for raw in dataframe.description.tolist()]

    encodings = [
        tokenizer(
            text,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_attention_mask=True,
        )
        for text in texts
    ]

    token_ids = [enc['input_ids'] for enc in encodings]
    masks = [enc['attention_mask'] for enc in encodings]
    return token_ids, masks
def _normalize_labels(
    labels_tensor,
    normalizer,
    labels_mean=None,
    labels_std=None,
    labels_min=None,
    labels_max=None,
):
    """Return ``labels_tensor`` scaled according to ``normalizer``.

    Raises:
        ValueError: If ``normalizer`` is not one of ``'z_norm'``,
            ``'mm_norm'``, ``'ls_norm'`` or ``'no_norm'``.
    """
    # NOTE(review): z_normalizer / min_max_scaling / log_scaling are presumably
    # supplied by the module's `from llmprop_utils import *` — confirm.
    if normalizer == 'z_norm':
        if labels_mean is not None and labels_std is not None:
            # Use the caller-supplied (global) statistics so that every split
            # is scaled identically; the epsilon guards against a zero std.
            return (labels_tensor - labels_mean) / (labels_std + 1e-8)
        print("[WARNING] No global mean/std passed — fallback to local normalization")
        return z_normalizer(labels_tensor)

    if normalizer == 'mm_norm':
        if labels_min is not None and labels_max is not None:
            # The epsilon guards against a zero range (min == max).
            return (labels_tensor - labels_min) / (labels_max - labels_min + 1e-8)
        return min_max_scaling(labels_tensor)

    if normalizer == 'ls_norm':
        return log_scaling(labels_tensor)

    if normalizer == 'no_norm':
        return labels_tensor

    # Previously an unknown value left `normalized_labels` unbound and surfaced
    # as an UnboundLocalError when the dataset was built.
    raise ValueError(
        f"Unknown normalizer {normalizer!r}; expected one of "
        "'z_norm', 'mm_norm', 'ls_norm', 'no_norm'"
    )


def create_dataloaders(
    tokenizer,
    dataframe,
    max_length,
    batch_size,
    property_value="band_gap",
    pooling='cls',
    normalize=False,
    normalizer='z_norm',
    shuffle=None,
    labels_mean=None,
    labels_std=None,
    labels_min=None,
    labels_max=None,
):
    """Build a ``DataLoader`` over tokenized descriptions and their labels.

    Args:
        tokenizer: Tokenizer forwarded to ``tokenize``.
        dataframe: Table providing the ``description`` column (read by
            ``tokenize``) and the ``property_value`` column (the labels).
        max_length: Forwarded to ``tokenize``.
        batch_size: Batch size of the returned ``DataLoader``.
        property_value: Name of the label column in ``dataframe``.
        pooling: Forwarded to ``tokenize``.
        normalize: When true, each batch additionally carries normalized
            labels as a fourth tensor.
        normalizer: One of ``'z_norm'``, ``'mm_norm'``, ``'ls_norm'`` or
            ``'no_norm'``; only consulted when ``normalize`` is true.
        shuffle: Whether the loader shuffles. ``None`` (default) means
            "shuffle exactly when ``normalize`` is true".
        labels_mean, labels_std: Statistics for ``'z_norm'``. If either is
            ``None`` a warning is printed and ``z_normalizer`` is applied to
            this dataframe's labels instead.
        labels_min, labels_max: Bounds for ``'mm_norm'``. If either is
            ``None``, ``min_max_scaling`` is applied to this dataframe's
            labels instead.

    Returns:
        A ``DataLoader`` whose batches are
        ``(input_ids, attention_mask, labels)`` or, when ``normalize`` is
        true, ``(input_ids, attention_mask, labels, normalized_labels)``.
        Token ids and masks are ``torch.long``; labels are ``torch.float32``.

    Raises:
        ValueError: If ``normalize`` is true and ``normalizer`` is not a
            recognized name.
    """
    labels = dataframe[property_value].to_numpy()
    labels_tensor = torch.tensor(labels, dtype=torch.float32)

    # Normalize before tokenizing so an invalid `normalizer` fails fast,
    # before any tokenization work is spent.
    normalized_labels = None
    if normalize:
        normalized_labels = _normalize_labels(
            labels_tensor,
            normalizer,
            labels_mean=labels_mean,
            labels_std=labels_std,
            labels_min=labels_min,
            labels_max=labels_max,
        )

    input_ids, attention_masks = tokenize(tokenizer, dataframe, max_length, pooling=pooling)
    input_tensor = torch.tensor(input_ids, dtype=torch.long)
    mask_tensor = torch.tensor(attention_masks, dtype=torch.long)

    if normalize:
        dataset = TensorDataset(input_tensor, mask_tensor, labels_tensor, normalized_labels)
    else:
        dataset = TensorDataset(input_tensor, mask_tensor, labels_tensor)

    if shuffle is None:
        # Default behavior: shuffle when normalize=True, otherwise keep the
        # row order deterministic.
        shuffle = normalize

    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)