| |
| """ |
| Task 1: Next-Word Prediction MLP Experiment Runner. |
| |
| This script trains an MLP model for next-word prediction. It can be configured |
| via command-line arguments to: |
| - Use either the 'shakespeare' or 'linux' dataset. |
| - Enable or disable regularization (Dropout and L2 Weight Decay). |
| |
| It saves the best-performing model, the final model, and the full training history. |
| |
| You can run multiple instances of this script in separate terminals to conduct |
| parallel experiments. |
| """ |
| import os |
| import re |
| import json |
| import time |
| import argparse |
| from collections import Counter |
|
|
| import torch |
| import torch.nn as nn |
| import torch.optim as optim |
| from torch.utils.data import TensorDataset, DataLoader |
| from tqdm import tqdm |
| import numpy as np |
| import matplotlib.pyplot as plt |
| from sklearn.manifold import TSNE |
| import random |
|
|
| |
|
|
def _ensure_downloaded(url, filename):
    """Download *url* to *filename* unless that file already exists.

    The payload is first written to ``<filename>.part`` and only renamed into
    place after the transfer finished, so an interrupted download never leaves
    a truncated file that later runs would mistake for a complete one.

    Raises:
        urllib.error.URLError: If the download fails (propagated unchanged).
    """
    if os.path.exists(filename):
        return

    # Function-scope import: keeps this block self-contained.
    import urllib.request

    print(f"Downloading {filename}...")
    partial_path = f"{filename}.part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, filename)
    finally:
        # Only still present when the download or the rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)


def download_and_preprocess_text(dataset_name):
    """Download (if not cached in the working directory) and clean a dataset.

    Args:
        dataset_name: Either ``'shakespeare'`` or ``'linux'``.

    Returns:
        The cleaned corpus as a single string. For ``'shakespeare'`` the text
        is lower-cased, reduced to letters, digits, spaces and dots, with
        whitespace collapsed to single spaces. For ``'linux'`` each line is
        filtered to a whitelist of code characters, stripped, and the lines
        are joined with ``' \\n '`` so that a newline becomes its own
        space-delimited token.

    Raises:
        ValueError: If ``dataset_name`` is not one of the supported names.
    """
    if dataset_name == 'shakespeare':
        filename = 'shakespeare_input.txt'
        _ensure_downloaded(
            'https://cs.stanford.edu/people/karpathy/char-rnn/shakespeare_input.txt',
            filename,
        )
        with open(filename, "r", encoding='utf-8') as f:
            text = f.read()
        # Turn every whitespace run (newlines included) into one space BEFORE
        # filtering: the character filter below drops newlines, which would
        # otherwise glue the last word of a line to the first word of the next.
        text = re.sub(r'\s+', ' ', text.lower())
        text = re.sub(r'[^a-zA-Z0-9 \.]', '', text)
        # Removing punctuation can leave double spaces behind; collapse again.
        return re.sub(r'\s+', ' ', text).strip()

    if dataset_name == 'linux':
        filename = 'linux_input.txt'
        _ensure_downloaded(
            'https://cs.stanford.edu/people/karpathy/char-rnn/linux_input.txt',
            filename,
        )
        # The file is read with undecodable bytes dropped.
        with open(filename, "r", encoding='utf-8', errors='ignore') as f:
            text = f.read()
        disallowed = re.compile(r'[^\w\s\.\(\)\[\]\{\}\=\+\-\*\/,;:"\'#<>&|!~`?]')
        processed_lines = [disallowed.sub('', line).strip() for line in text.split('\n')]
        return ' \n '.join(processed_lines)

    raise ValueError("Invalid dataset name. Choose 'shakespeare' or 'linux'.")
|
|
def create_vocabulary_and_pairs(text, context_window_size):
    """Build the vocabulary and the (context, target) training pairs.

    The text is split on single spaces (empty tokens are discarded), a short
    frequency report is printed, and every window of ``context_window_size``
    consecutive tokens is paired with the token that follows it.

    Args:
        text: The preprocessed corpus.
        context_window_size: Number of tokens in each context window.

    Returns:
        A tuple ``(contexts, targets, word_to_idx, idx_to_word)`` where
        ``contexts`` is a long tensor of shape
        ``(num_pairs, context_window_size)``, ``targets`` is a long tensor of
        shape ``(num_pairs,)``, and the two dicts map words to indices and
        back. Index 0 is always ``'<pad>'``.
    """
    print("Tokenizing text...")
    tokens = [token for token in text.split(' ') if token]

    word_counts = Counter(tokens)
    print("\n--- Vocabulary Report ---")
    print(f"10 Most Frequent Words: {word_counts.most_common(10)}")
    print(f"10 Least Frequent Words: {word_counts.most_common()[:-11:-1]}")

    # '<pad>' is reserved for index 0. Keeping it out of the sorted vocabulary
    # guarantees contiguous indices 0..len-1 even when the corpus itself
    # contains a literal '<pad>' token; otherwise the largest index would
    # equal len(word_to_idx) and overflow an embedding of that size.
    vocab = sorted(word for word in word_counts if word != '<pad>')
    word_to_idx = {word: index for index, word in enumerate(vocab, start=1)}
    word_to_idx['<pad>'] = 0
    idx_to_word = {index: word for word, index in word_to_idx.items()}
    vocab_size = len(word_to_idx)
    print(f"Vocabulary Size: {vocab_size}")

    indexed_tokens = [word_to_idx[word] for word in tokens]
    # Zero pairs when the text is not longer than the context window.
    num_pairs = max(len(indexed_tokens) - context_window_size, 0)
    contexts = [indexed_tokens[i:i + context_window_size] for i in range(num_pairs)]
    targets = [indexed_tokens[i + context_window_size] for i in range(num_pairs)]

    # The explicit reshape keeps the context tensor two-dimensional even when
    # there are no pairs (torch.tensor([]) alone would have shape (0,)).
    context_tensor = torch.tensor(contexts, dtype=torch.long).reshape(num_pairs, context_window_size)
    target_tensor = torch.tensor(targets, dtype=torch.long)
    return context_tensor, target_tensor, word_to_idx, idx_to_word
|
|
| |
|
|
class NextWordPredictor(nn.Module):
    """MLP that predicts the next word from a fixed-size window of word indices.

    Architecture: embedding -> flatten -> Linear -> ReLU -> (dropout)
    -> Linear -> ReLU -> (dropout) -> Linear producing one logit per
    vocabulary entry. Embedding index 0 is the padding index.

    Args:
        vocab_size: Number of entries in the embedding table / output logits.
        embedding_dim: Size of each word embedding.
        context_size: Number of context words per example.
        hidden_dim: Width of both hidden layers.
        use_dropout: Apply dropout after each hidden activation when True.
        dropout_p: Dropout probability used when ``use_dropout`` is True.
    """

    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim,
                 use_dropout=False, dropout_p=0.5):
        super().__init__()
        self.use_dropout = use_dropout
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.fc1 = nn.Linear(context_size * embedding_dim, hidden_dim)
        self.relu = nn.ReLU()
        # Identity stands in for dropout when regularization is off, so
        # forward() needs no branching. Neither module has parameters, which
        # keeps the state_dict layout identical in both configurations.
        self.dropout = nn.Dropout(p=dropout_p) if use_dropout else nn.Identity()
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return logits of shape ``(batch, vocab_size)`` for index tensor ``x``.

        ``x`` is expected to hold ``context_size`` word indices per row; the
        per-word embeddings of each row are concatenated before the first
        linear layer.
        """
        embedded = self.embedding(x).flatten(start_dim=1)
        hidden = self.dropout(self.relu(self.fc1(embedded)))
        hidden = self.dropout(self.relu(self.fc2(hidden)))
        return self.fc3(hidden)
|
|
| |
|
|
def _run_training_epoch(model, loader, optimizer, criterion, scaler, device, use_amp, desc):
    """Run one optimisation pass over *loader*; return the mean per-batch loss."""
    model.train()
    total_loss = 0.0
    for inputs, labels in tqdm(loader, desc=desc):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        with torch.cuda.amp.autocast(enabled=use_amp):
            outputs = model(inputs)
            loss = criterion(outputs, labels)
        # The scaler is a pass-through when mixed precision is disabled.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        total_loss += loss.item()
    return total_loss / len(loader)


def _run_validation_epoch(model, loader, criterion, device, use_amp, desc):
    """Evaluate *model* over *loader* without gradients; return the mean per-batch loss."""
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for inputs, labels in tqdm(loader, desc=desc):
            inputs, labels = inputs.to(device), labels.to(device)
            with torch.cuda.amp.autocast(enabled=use_amp):
                outputs = model(inputs)
                loss = criterion(outputs, labels)
            total_loss += loss.item()
    return total_loss / len(loader)


def _save_loss_curve(history, run_name):
    """Plot the training/validation loss history to ``<run_name>_loss_curve.png``."""
    figure = plt.figure(figsize=(10, 5))
    plt.plot(history['train_loss'], label='Training Loss')
    plt.plot(history['val_loss'], label='Validation Loss')
    # With zero epochs there is no best epoch to mark (argmin of an empty
    # sequence raises), so the marker is drawn only when history exists.
    if history['val_loss']:
        best_epoch = np.argmin(history['val_loss'])
        plt.axvline(best_epoch, linestyle='--', color='r', label=f'Best Model (Epoch {best_epoch+1})')
    plt.title(f'Training vs. Validation Loss ({run_name})')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)
    plt.savefig(f'{run_name}_loss_curve.png')
    # Release the figure so it does not stay registered with pyplot.
    plt.close(figure)
    print(f"Loss curve saved to {run_name}_loss_curve.png")


def main(args):
    """Main training and evaluation function.

    Builds the dataset, trains the MLP for ``args.epochs`` epochs and writes,
    all prefixed with ``<dataset>_regularized`` or ``<dataset>_normal``:
    the word-to-index mapping (JSON), the best model by validation loss, the
    model after the last epoch, the loss history (JSON) and a loss-curve PNG.

    Args:
        args: Parsed command-line namespace providing ``dataset``,
            ``use_regularization``, ``context_size``, ``embedding_dim``,
            ``hidden_dim``, ``epochs``, ``batch_size`` and ``lr``.

    Raises:
        ValueError: If the corpus yields too few examples for an 80/20 split.
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if not use_cuda:
        print("WARNING: CUDA not available, running on CPU.")

    file_suffix = "_regularized" if args.use_regularization else "_normal"
    run_name = f"{args.dataset}{file_suffix}"

    print(f"--- Starting Run: {run_name} on {device} ---")

    # --- Data ---
    raw_text = download_and_preprocess_text(args.dataset)
    contexts, targets, word_to_idx, idx_to_word = create_vocabulary_and_pairs(raw_text, args.context_size)
    vocab_size = len(word_to_idx)

    with open(f'{run_name}_word_to_idx.json', 'w') as f:
        json.dump(word_to_idx, f)

    dataset = TensorDataset(contexts, targets)
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    if train_size == 0 or val_size == 0:
        # An empty split would otherwise surface later as a division by zero
        # when the per-epoch loss is averaged over zero batches.
        raise ValueError(
            f"Dataset has only {len(dataset)} example(s); need at least 2 for a train/validation split."
        )
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

    # Pinned host memory only helps host-to-GPU copies; request it only when
    # a GPU is actually present.
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, pin_memory=use_cuda, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=args.batch_size, pin_memory=use_cuda, num_workers=4)

    # --- Model, optimizer, loss ---
    model = NextWordPredictor(
        vocab_size, args.embedding_dim, args.context_size, args.hidden_dim, use_dropout=args.use_regularization
    ).to(device)

    weight_decay_val = 1e-4 if args.use_regularization else 0.0
    optimizer = optim.AdamW(model.parameters(), lr=args.lr, weight_decay=weight_decay_val)
    # Targets equal to the padding index (0) do not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    scaler = torch.cuda.amp.GradScaler(enabled=use_cuda)

    # --- Training loop ---
    history = {'train_loss': [], 'val_loss': []}
    best_val_loss = float('inf')

    for epoch in range(args.epochs):
        avg_train_loss = _run_training_epoch(
            model, train_loader, optimizer, criterion, scaler, device, use_cuda,
            f"Epoch {epoch+1}/{args.epochs} [{run_name} Train]",
        )
        history['train_loss'].append(avg_train_loss)

        avg_val_loss = _run_validation_epoch(
            model, val_loader, criterion, device, use_cuda,
            f"Epoch {epoch+1}/{args.epochs} [{run_name} Val]",
        )
        history['val_loss'].append(avg_val_loss)

        print(f"[{run_name}] Epoch {epoch+1}: Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

        # Checkpoint whenever the validation loss improves.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), f'{run_name}_model_best.pth')
            print(f"[{run_name}] New best validation loss: {best_val_loss:.4f}. Saving best model.")

    # Weights as they stand after the last epoch, regardless of quality.
    torch.save(model.state_dict(), f'{run_name}_model_latest.pth')
    print(f"[{run_name}] Saved final model from epoch {args.epochs}.")

    print(f"--- [{run_name}] Training Complete ---")
    print(f"Final Best Validation Loss: {best_val_loss:.4f}")

    # --- Artifacts ---
    history_path = f'{run_name}_training_history.json'
    with open(history_path, 'w') as f:
        json.dump(history, f)
    print(f"Training history saved to {history_path}")

    _save_loss_curve(history, run_name)
|
|
| |
|
|
if __name__ == "__main__":
    # Command-line entry point: collect the experiment settings and train.
    cli = argparse.ArgumentParser(description="Single-GPU MLP Experiment Runner")

    # Experiment selection.
    cli.add_argument('--dataset', type=str, required=True, choices=['shakespeare', 'linux'], help='Dataset to use.')
    cli.add_argument('--use_regularization', action='store_true', help='Enable Dropout and L2 Weight Decay.')

    # Model and training hyperparameters as (flag, type, default, help text),
    # registered in the order they should appear in --help.
    hyperparameter_specs = (
        ('--context_size', int, 5, 'Number of context words.'),
        ('--embedding_dim', int, 64, 'Dimension of word embeddings.'),
        ('--hidden_dim', int, 1024, 'Dimension of hidden layers.'),
        ('--epochs', int, 500, 'Number of training epochs.'),
        ('--batch_size', int, 40960, 'Batch size for training.'),
        ('--lr', float, 1e-3, 'Learning rate.'),
    )
    for flag, value_type, default_value, help_text in hyperparameter_specs:
        cli.add_argument(flag, type=value_type, default=default_value, help=help_text)

    main(cli.parse_args())