| | import json |
| | import os |
| | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" |
| | os.environ["CUDA_VISIBLE_DEVICES"] = "2" |
| | import torch |
| | from torch.optim import AdamW |
| | from torch.utils.data import Dataset, DataLoader |
| | from transformers import DistilBertTokenizer, DistilBertForSequenceClassification |
| | from sklearn.preprocessing import LabelEncoder |
| |
|
| | |
| | DATA_PATH = "/home/mshahidul/readctrl/code/text_classifier/verified_combined_0-80.json" |
| | MODEL_NAME = "distilbert-base-uncased" |
| | MAX_LEN = 512 |
| | BATCH_SIZE = 8 |
| | EPOCHS = 3 |
| | SAVE_DIR = "/home/mshahidul/readctrl/code/text_classifier/distilbert_health_literacy" |
| |
|
| | with open(DATA_PATH, 'r') as f: |
| | raw_data = json.load(f) |
| |
|
| | |
class HealthLiteracyDataset(Dataset):
    """Torch dataset producing DistilBERT-ready tensors for classification.

    Each element of ``data`` is a dict with keys ``"fulltext"``,
    ``"diff_label_texts"``, and ``"label"`` (see the JSON loaded at module
    level). The full text and the diff text are tokenized as a sentence
    pair. NOTE(review): ``"diff_label_texts"`` is plural — assumed here to
    be a single string usable as the second segment; confirm against the
    data file.
    """

    def __init__(self, data, tokenizer, label_encoder, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.label_encoder = label_encoder
        self.max_len = max_len
        # Encode every label once up front: calling
        # LabelEncoder.transform() per __getitem__ (as the naive version
        # does) pays sklearn overhead on every batch element.
        self.encoded_labels = label_encoder.transform(
            [entry["label"] for entry in data]
        )

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        entry = self.data[item]

        # Tokenize the (fulltext, diff text) pair, padded/truncated to a
        # fixed length so the DataLoader can stack examples into batches.
        encoding = self.tokenizer.encode_plus(
            entry["fulltext"],
            entry["diff_label_texts"],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_overflowing_tokens=False,
            return_attention_mask=True,
            return_tensors='pt',
        )

        label = self.encoded_labels[item]

        # encode_plus with return_tensors='pt' yields shape (1, max_len);
        # flatten to (max_len,) so default collation produces (B, max_len).
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }
| |
|
| | |
# Tokenizer and label vocabulary --------------------------------------------
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

# Fit the encoder on every label in the raw data so string labels map to
# contiguous integer class ids; the class count sizes the model head below.
label_encoder = LabelEncoder()
all_labels = [record['label'] for record in raw_data]
label_encoder.fit(all_labels)
num_labels = len(label_encoder.classes_)

# Wrap the raw examples in the Dataset and a shuffling DataLoader.
dataset = HealthLiteracyDataset(raw_data, tokenizer, label_encoder, MAX_LEN)
loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
| |
|
| | |
| | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") |
| | model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels) |
| | model.to(device) |
| |
|
| | |
| | optimizer = AdamW(model.parameters(), lr=2e-5) |
| |
|
# Fine-tune: full pass over the loader each epoch, reporting the loss of
# the final batch per epoch (not an epoch average).
model.train()
for epoch in range(1, EPOCHS + 1):
    batch_loss = None
    for batch in loader:
        optimizer.zero_grad()

        # Move every tensor in the batch onto the training device.
        moved = {name: tensor.to(device) for name, tensor in batch.items()}

        outputs = model(
            moved['input_ids'],
            attention_mask=moved['attention_mask'],
            labels=moved['labels'],
        )
        batch_loss = outputs.loss
        batch_loss.backward()
        optimizer.step()

    print(f"Epoch {epoch} complete. Loss: {batch_loss.item():.4f}")
| |
|
| | |
# Persist the fine-tuned model, its tokenizer, and the fitted label classes
# so predictions can later be decoded back to the original string labels.
os.makedirs(SAVE_DIR, exist_ok=True)
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
# Explicit encoding keeps the label file portable across locales.
with open(os.path.join(SAVE_DIR, "label_encoder_classes.json"), "w", encoding="utf-8") as f:
    json.dump(label_encoder.classes_.tolist(), f, indent=2)