import os

import pandas as pd
import torch
import torch.nn as nn
from datasets import Dataset as HFDataset
from torch.utils.data import DataLoader, Dataset
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, AlbertConfig
# Output directory for the fine-tuned model; created up front if missing.
model_dir = 'model'
os.makedirs(model_dir, exist_ok=True)


def _binarize_rating(value):
    """Return 1 when *value* is at least 0.5, otherwise 0."""
    return int(value >= 0.5)


# Each split is read from its single Arrow shard on disk.
train_dataset = HFDataset.from_file('train/data-00000-of-00001.arrow')
val_dataset = HFDataset.from_file('validation/data-00000-of-00001.arrow')
test_dataset = HFDataset.from_file('test/data-00000-of-00001.arrow')

# Convert every split to a pandas DataFrame for column-wise preprocessing.
train_df, val_df, test_df = (
    split.to_pandas() for split in (train_dataset, val_dataset, test_dataset)
)

# The same cleanup is applied to all three frames, in place:
#   * drop trailing question marks from the text column,
#   * turn the numeric rating into a 0/1 class label (threshold 0.5).
for frame in (train_df, val_df, test_df):
    frame['content'] = frame['content'].str.rstrip('?')
    frame['rating'] = frame['rating'].apply(_binarize_rating)

tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
class QueryDataset(Dataset):
    """Map-style dataset that tokenizes one ``(text, label)`` pair per index.

    Args:
        texts: Indexable collection of input texts; each item is passed
            through ``str()`` before tokenization.
        labels: Indexable collection of class labels, parallel to ``texts``;
            each item is passed through ``int()``.
        tokenizer: Callable tokenizer. It is invoked with the text plus the
            keyword arguments shown in ``__getitem__`` and must return a
            mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every sequence is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length: int = 32):
        # Fail early: __len__ only looks at ``texts``, so a mismatch would
        # otherwise go unnoticed or raise IndexError in the middle of an epoch.
        if len(texts) != len(labels):
            raise ValueError(
                f'texts and labels must have the same length, '
                f'got {len(texts)} and {len(labels)}'
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self) -> int:
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx``.

        Returns:
            dict with 1-D ``'input_ids'`` and ``'attention_mask'`` tensors and
            a scalar ``torch.long`` ``'label'`` tensor.
        """
        # Call the tokenizer directly; ``encode_plus`` is documented as
        # deprecated in favour of ``__call__`` and takes the same arguments.
        encoding = self.tokenizer(
            str(self.texts[idx]),
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # Flatten to 1-D so the DataLoader's default collation stacks samples
        # into (batch, max_length); presumably needed because
        # ``return_tensors='pt'`` adds a leading batch axis of size 1.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(int(self.labels[idx]), dtype=torch.long),
        }
def _accuracy(model, data_loader, device):
    """Return the fraction of samples in *data_loader* that *model* classifies correctly.

    Puts the model in eval mode and runs without gradient tracking. Returns
    0.0 for an empty loader instead of dividing by zero.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            logits = model(input_ids, attention_mask=attention_mask).logits
            correct += (logits.argmax(dim=1) == labels).sum().item()
            total += labels.size(0)
    return correct / total if total else 0.0


# Wrap the preprocessed frames in tokenizing datasets (these rebind the
# train/val/test dataset names that previously held the raw Arrow splits).
train_dataset = QueryDataset(train_df['content'].values, train_df['rating'].values, tokenizer)
val_dataset = QueryDataset(val_df['content'].values, val_df['rating'].values, tokenizer)
test_dataset = QueryDataset(test_df['content'].values, test_df['rating'].values, tokenizer)

batch_size = 128
# Only the training loader is shuffled; evaluation order does not matter.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are passed explicitly to mirror the defaults of the optimizer
# this replaces (torch's own defaults differ).
# NOTE(review): the `AdamW` name imported from transformers is no longer used
# by this script.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
criterion = nn.CrossEntropyLoss()

epochs = 4
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean per-batch loss; max(..., 1) avoids ZeroDivisionError on an empty
    # training loader.
    avg_loss = total_loss / max(len(train_loader), 1)
    print(f'Epoch {epoch + 1}, Loss: {avg_loss:.4f}')

    accuracy = _accuracy(model, val_loader, device)
    print(f'Validation Accuracy after Epoch {epoch + 1}: {accuracy:.4f}')

# save_pretrained writes the weights together with the fine-tuned model's own
# config.json (which already carries num_labels=2 from from_pretrained above).
# The config is deliberately NOT re-saved from a fresh base-checkpoint
# AlbertConfig afterwards: that would overwrite the fine-tuned model's config
# with the base checkpoint's metadata.
model.save_pretrained(model_dir, safe_serialization=True)
tokenizer.save_pretrained(model_dir)

# Report held-out performance once, after training; the test loader was
# previously built but never evaluated.
test_accuracy = _accuracy(model, test_loader, device)
print(f'Test Accuracy: {test_accuracy:.4f}')

print(f"Model and all required files saved to {model_dir}")