import json
import os

# Pin the script to a single GPU; these must be set before importing torch.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.preprocessing import LabelEncoder

# 1. Configuration & Data Loading
DATA_PATH = "/home/mshahidul/readctrl/code/text_classifier/verified_combined_0-80.json"
MODEL_NAME = "distilbert-base-uncased"
MAX_LEN = 512
BATCH_SIZE = 8
EPOCHS = 3
SAVE_DIR = "/home/mshahidul/readctrl/code/text_classifier/distilbert_health_literacy"

with open(DATA_PATH, "r") as f:
    raw_data = json.load(f)

# 2. Dataset Class
class HealthLiteracyDataset(Dataset):
    def __init__(self, data, tokenizer, label_encoder, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.label_encoder = label_encoder
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        entry = self.data[item]
        # Encode fulltext and diff_label_texts as a sentence pair. The tokenizer
        # joins them with a [SEP] token; DistilBERT has no token-type embeddings,
        # but the pair encoding is still valid input.
        encoding = self.tokenizer(
            entry["fulltext"],
            entry["diff_label_texts"],
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        label = self.label_encoder.transform([entry["label"]])[0]
        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(label, dtype=torch.long),
        }

# 3. Setup Components
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

# Fit the label encoder on every label in the dataset so class indices are stable.
label_encoder = LabelEncoder()
all_labels = [d["label"] for d in raw_data]
label_encoder.fit(all_labels)
num_labels = len(label_encoder.classes_)

dataset = HealthLiteracyDataset(raw_data, tokenizer, label_encoder, MAX_LEN)
loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# 4. Initialize Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)
model.to(device)

# 5. Training Loop (Simplified)
optimizer = AdamW(model.parameters(), lr=2e-5)
model.train()
for epoch in range(EPOCHS):
    total_loss = 0.0
    for batch in loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        # Passing labels makes the model compute cross-entropy loss internally.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean loss over the epoch rather than only the final batch's loss.
    print(f"Epoch {epoch + 1} complete. Avg loss: {total_loss / len(loader):.4f}")

# 6. Save Model, Tokenizer, and Label Encoder
os.makedirs(SAVE_DIR, exist_ok=True)
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

# Persist the label-encoder classes so predictions can be mapped back to label names.
with open(os.path.join(SAVE_DIR, "label_encoder_classes.json"), "w") as f:
    json.dump(label_encoder.classes_.tolist(), f, indent=2)
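
# 7. Inference Sketch
# A minimal sanity check of the saved artifacts, assuming the training above has
# already run: it reloads the model, tokenizer, and saved label classes from
# SAVE_DIR and classifies one (fulltext, diff_label_texts) pair. classify_pair
# and the sample strings below are hypothetical placeholders for illustration,
# not part of the original pipeline or dataset.
def classify_pair(fulltext: str, diff_label_text: str) -> str:
    clf_model = DistilBertForSequenceClassification.from_pretrained(SAVE_DIR).to(device)
    clf_tokenizer = DistilBertTokenizer.from_pretrained(SAVE_DIR)
    with open(os.path.join(SAVE_DIR, "label_encoder_classes.json")) as f:
        classes = json.load(f)

    clf_model.eval()
    enc = clf_tokenizer(
        fulltext,
        diff_label_text,
        max_length=MAX_LEN,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    ).to(device)
    with torch.no_grad():
        logits = clf_model(**enc).logits
    # argmax over the logits gives the class index; map it back to the label name.
    return classes[logits.argmax(dim=-1).item()]

if __name__ == "__main__":
    print(classify_pair("Example source passage...", "Example simplified passage..."))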