"""Data utilities for loading and preprocessing datasets.""" import json from typing import Dict, Any, Tuple from datasets import load_dataset, Dataset from transformers import AutoTokenizer import numpy as np def load_config(config_path: str = "config.json") -> Dict[str, Any]: """Load configuration from JSON file.""" with open(config_path, "r") as f: return json.load(f) def load_and_prepare_dataset( dataset_name: str, tokenizer_name: str, train_size: int = 4000, eval_size: int = 1000, test_size: int = 500, max_length: int = 512 ) -> Tuple[Dataset, Dataset, Dataset]: """ Load dataset and prepare for training. Args: dataset_name: Name of the dataset (e.g., 'imdb') tokenizer_name: Name of the tokenizer to use train_size: Number of training samples eval_size: Number of evaluation samples test_size: Number of test samples max_length: Maximum sequence length Returns: Tuple of (train_dataset, eval_dataset, test_dataset) """ # Load dataset dataset = load_dataset(dataset_name) # Load tokenizer tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) def tokenize_function(examples): return tokenizer( examples["text"], padding="max_length", truncation=True, max_length=max_length ) # Tokenize dataset tokenized_dataset = dataset.map(tokenize_function, batched=True) # Prepare train/eval/test splits train_dataset = tokenized_dataset["train"].shuffle(seed=42).select(range(train_size)) # Use test set for both eval and final test test_full = tokenized_dataset["test"].shuffle(seed=42) eval_dataset = test_full.select(range(eval_size)) test_dataset = test_full.select(range(eval_size, eval_size + test_size)) return train_dataset, eval_dataset, test_dataset def prepare_labels_for_classification(dataset: Dataset) -> Dataset: """Ensure labels are properly formatted for classification.""" def format_labels(example): example["labels"] = example["label"] return example return dataset.map(format_labels) class DataCollector: """Custom data collector for handling various data preprocessing needs.""" def __init__(self, tokenizer): self.tokenizer = tokenizer def __call__(self, features): """Standard data collation for transformer training.""" batch = self.tokenizer.pad(features, return_tensors="pt") return batch def compute_class_distribution(dataset: Dataset) -> Dict[str, float]: """Compute class distribution in the dataset.""" labels = dataset["label"] if "label" in dataset.column_names else dataset["labels"] unique, counts = np.unique(labels, return_counts=True) total = len(labels) distribution = {} for label, count in zip(unique, counts): distribution[f"class_{label}"] = count / total return distribution def get_sample_texts(dataset: Dataset, n_samples: int = 5) -> list: """Get sample texts from dataset for inspection.""" indices = np.random.choice(len(dataset), n_samples, replace=False) samples = [] for idx in indices: sample = dataset[idx] samples.append({ "text": sample["text"][:200] + "..." if len(sample["text"]) > 200 else sample["text"], "label": sample["label"] if "label" in sample else sample["labels"] }) return samples