import streamlit as st  # used by the inference sketch at the bottom of the file
import torch
from torch.utils.data import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Load a pre-trained clinical language model. Bio_ClinicalBERT is a BERT-style
# encoder (not a GPT-style causal LM), and the per-example scalar labels below
# imply a classification task, so we load it with a sequence-classification head.
# num_labels=2 assumes a binary task; adjust it to your label set.
model = AutoModelForSequenceClassification.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT", num_labels=2
)

# Tokenize clinical text with the model's matching tokenizer.
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

# Convert the tokenized data into PyTorch tensors via a PyTorch Dataset.
    
class ClinicalDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        # Pad every example to a fixed length so the collator below can stack
        # them into a batch (padding=True on a single text is a no-op).
        encoding = self.tokenizer(
            text,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label),
        }
     
dataset = ClinicalDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
# val_dataset is passed to the Trainer below, so build it the same way.
val_dataset = ClinicalDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)
# Fine-tune the pre-trained model on the clinical dataset.
training_args = TrainingArguments(
    output_dir="./results",          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # warmup steps for the learning-rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir="./logs",            # directory for storing logs
    logging_steps=10,                # log metrics every 10 steps
)
    
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=val_dataset,
    # Stack the fixed-length tensors from ClinicalDataset into batched inputs.
    data_collator=lambda data: {
        "input_ids": torch.stack([f["input_ids"] for f in data]),
        "attention_mask": torch.stack([f["attention_mask"] for f in data]),
        "labels": torch.stack([f["labels"] for f in data]),
    },
)
trainer.train()
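# After training, persist the fine-tuned model and tokenizer for inference.
trainer.save_model("./results/final")
tokenizer.save_pretrained("./results/final")

# A minimal Streamlit front end for the fine-tuned classifier: a sketch, and
# the reason `streamlit` is imported at the top. The widget labels are made
# up; launch the app with `streamlit run <this_file>.py`.
note = st.text_area("Paste a clinical note:")
if st.button("Classify") and note:
    inputs = tokenizer(note, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**inputs).logits
    st.write(f"Predicted label: {logits.argmax(dim=-1).item()}")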