File size: 3,272 Bytes
9c6961c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import json
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.preprocessing import LabelEncoder

# 1. Configuration & Data Loading
# Path of the JSON dataset that is loaded into `raw_data` below.
DATA_PATH = "/home/mshahidul/readctrl/code/text_classifier/verified_combined_0-80.json"
# Name of the pretrained checkpoint to load.
MODEL_NAME = "distilbert-base-uncased"
# Maximum sequence length, in tokens, for one encoded example.
MAX_LEN = 512
# Number of examples per training batch.
BATCH_SIZE = 8
# Number of full passes over the dataset.
EPOCHS = 3
# Directory that receives the trained artifacts.
SAVE_DIR = "/home/mshahidul/readctrl/code/text_classifier/distilbert_health_literacy"

# Decode explicitly as UTF-8: without an encoding argument open() falls back
# to the platform locale, which can mis-decode (or fail on) non-ASCII text.
with open(DATA_PATH, 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

# 2. Dataset Class
class HealthLiteracyDataset(Dataset):
    """Dataset that tokenizes text pairs and encodes their labels on access.

    Each record in ``data`` must provide the keys ``"fulltext"``,
    ``"diff_label_texts"`` and ``"label"``.

    Args:
        data: Indexable, sized collection of records (see keys above).
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer); it is
            invoked with the two texts plus padding/truncation options and
            must return a mapping with ``"input_ids"`` and
            ``"attention_mask"`` tensors.
        label_encoder: Object whose ``transform`` maps a list of raw labels
            to integer class ids.
        max_len: Length, in tokens, that every example is padded or
            truncated to.
    """

    def __init__(self, data, tokenizer, label_encoder, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.label_encoder = label_encoder
        self.max_len = max_len

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, item):
        """Return the model inputs for the record at index ``item``.

        Returns:
            A dict with ``'input_ids'`` and ``'attention_mask'`` (the
            tokenizer outputs flattened to 1-D) and ``'labels'`` (a scalar
            ``torch.long`` tensor holding the encoded class id).
        """
        entry = self.data[item]

        # Both texts are passed as a pair so the tokenizer encodes them into
        # a single sequence, padded/truncated to max_len.
        # The tokenizer is called directly: `encode_plus` is deprecated in
        # favour of `__call__`, which accepts the same arguments.
        encoding = self.tokenizer(
            entry["fulltext"],
            entry["diff_label_texts"],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_overflowing_tokens=False,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # transform() works on a batch, so wrap the single label and unwrap
        # the single result.
        label = self.label_encoder.transform([entry["label"]])[0]

        return {
            # return_tensors='pt' adds a batch dimension; flatten removes it
            # so the DataLoader can stack examples itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# 3. Setup Components
# Tokenizer that matches the pretrained checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

# Fit the label encoder on every label in the dataset; the number of distinct
# classes it finds sizes the classification head.
all_labels = [record['label'] for record in raw_data]
label_encoder = LabelEncoder()
label_encoder.fit(all_labels)
num_labels = len(label_encoder.classes_)

# Wrap the raw records in the dataset and serve them in shuffled batches.
dataset = HealthLiteracyDataset(
    data=raw_data,
    tokenizer=tokenizer,
    label_encoder=label_encoder,
    max_len=MAX_LEN,
)
loader = DataLoader(dataset, shuffle=True, batch_size=BATCH_SIZE)

# 4. Initialize Model
# Train on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained checkpoint with one output per label class.
model = DistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
)
model.to(device)

# 5. Training Loop (Simplified)
optimizer = AdamW(model.parameters(), lr=2e-5)

model.train()
for epoch in range(EPOCHS):
    # Sum the batch losses on the training device so the value is only
    # copied back to the host once per epoch.
    epoch_loss = torch.zeros((), device=device)
    num_batches = 0

    for batch in loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing `labels` makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # detach() keeps the running total out of the autograd graph.
        epoch_loss += loss.detach()
        num_batches += 1

    if num_batches:
        # Report the mean loss over the epoch rather than whichever batch
        # happened to come last.
        mean_loss = (epoch_loss / num_batches).item()
        print(f"Epoch {epoch + 1} complete. Loss: {mean_loss:.4f}")
    else:
        # An empty loader leaves no loss to report; say so instead of
        # failing on an undefined `loss`.
        print(f"Epoch {epoch + 1} complete. No batches were processed.")

# 6. Save Model, Tokenizer, and Label Encoder
os.makedirs(SAVE_DIR, exist_ok=True)

# The model is written first, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(SAVE_DIR)

# Store the class names in encoder order so predicted ids can be mapped back
# to the original labels.
classes_path = os.path.join(SAVE_DIR, "label_encoder_classes.json")
with open(classes_path, "w") as f:
    class_names = label_encoder.classes_.tolist()
    json.dump(class_names, f, indent=2)