import os

import pandas as pd
import torch
import torch.nn as nn
from datasets import Dataset as HFDataset
from torch.utils.data import DataLoader, Dataset
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, AlbertConfig


# Directory that receives the fine-tuned model, tokenizer and config files.
model_dir = 'model'

# Create the output directory before training starts; exist_ok=True keeps
# repeated runs from failing when the directory is already there.
os.makedirs(model_dir, exist_ok=True)

def _prepare_frame(split):
    """Convert one loaded split into a DataFrame ready for binary training.

    Args:
        split: Object exposing ``to_pandas()`` whose result has a ``content``
            text column and a numeric ``rating`` column.

    Returns:
        The DataFrame with trailing ``?`` characters stripped from
        ``content`` and ``rating`` replaced by 1 where the original value is
        >= 0.5 and by 0 otherwise.
    """
    frame = split.to_pandas()
    frame['content'] = frame['content'].str.rstrip('?')
    # Vectorised threshold; yields the same 0/1 labels as a per-row
    # ``int(x >= 0.5)`` (NaN compares False and therefore maps to 0).
    frame['rating'] = (frame['rating'] >= 0.5).astype('int64')
    return frame


# Raw Arrow splits as written by the `datasets` library.
train_dataset = HFDataset.from_file('train/data-00000-of-00001.arrow')
val_dataset = HFDataset.from_file('validation/data-00000-of-00001.arrow')
test_dataset = HFDataset.from_file('test/data-00000-of-00001.arrow')

# One cleaned DataFrame per split; all three get identical preprocessing.
train_df = _prepare_frame(train_dataset)
val_df = _prepare_frame(val_dataset)
test_df = _prepare_frame(test_dataset)

# Tokenizer matching the 'albert-base-v2' checkpoint that is fine-tuned below.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

class QueryDataset(Dataset):
    """Map-style dataset that tokenizes one text/label pair per lookup.

    Tokenization happens lazily in ``__getitem__``; nothing is encoded when
    the dataset is constructed.
    """

    def __init__(self, texts, labels, tokenizer, max_length=32):
        """Store the raw data and tokenization settings.

        Args:
            texts: Indexable collection of input texts.
            labels: Indexable collection of labels, aligned with ``texts``.
            tokenizer: Object providing an ``encode_plus`` method.
            max_length: Length every encoded sequence is padded or
                truncated to.
        """
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Encode the sample at ``idx``.

        Returns:
            A dict with ``input_ids`` and ``attention_mask`` flattened to
            1-D tensors, and ``label`` as a ``torch.long`` scalar tensor.
        """
        # Coerce explicitly so non-string texts and NumPy integer labels
        # are accepted.
        text = str(self.texts[idx])
        label = int(self.labels[idx])

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tokenizer output is flattened so each sample is 1-D and the
        # DataLoader's default collation stacks samples into a 2-D batch.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long),
        }

# Wrap each cleaned DataFrame in a QueryDataset. The *_dataset names are
# rebound here from the raw splits to the tokenizing datasets.
train_dataset = QueryDataset(train_df['content'].values, train_df['rating'].values, tokenizer)
val_dataset = QueryDataset(val_df['content'].values, val_df['rating'].values, tokenizer)
test_dataset = QueryDataset(test_df['content'].values, test_df['rating'].values, tokenizer)

batch_size = 128

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# ALBERT encoder with a 2-way classification head.
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=2)

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are pinned to the defaults of the transformers implementation
# so the optimisation hyper-parameters stay the same as before.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

# Loss is computed explicitly from the logits in the training loop.
criterion = nn.CrossEntropyLoss()

epochs = 4

for epoch in range(epochs):
    # ---- Training pass over the whole training set ----
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()

        # .item() detaches the scalar so no autograd graph is kept alive.
        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}, Loss: {avg_loss:.4f}')

    # ---- Validation pass after each epoch ----
    model.eval()
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            # Predicted class is the index of the largest logit.
            preds = torch.argmax(outputs.logits, dim=1)
            correct_predictions += (preds == labels).sum().item()
            total_predictions += labels.size(0)

    accuracy = correct_predictions / total_predictions
    print(f'Validation Accuracy after Epoch {epoch + 1}: {accuracy:.4f}')

# Persist the fine-tuned weights (safetensors format) and the tokenizer files.
# save_pretrained also writes config.json from the fine-tuned model's config.
model.save_pretrained(model_dir, safe_serialization=True)
tokenizer.save_pretrained(model_dir)

# The saved config.json is deliberately left alone. Re-downloading the base
# 'albert-base-v2' config and saving it here would overwrite the fine-tuned
# model's metadata with that of the base checkpoint.
# `config` stays bound for any later code that refers to it.
config = model.config

print(f"Model and all required files saved to {model_dir}")