# SurgerySort / app.py
import streamlit as st
import torch
from torch.utils.data import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load pre-trained Bio_ClinicalBERT with a sequence-classification head;
# a classification head (rather than a causal-LM head) matches the labeled
# examples used for fine-tuning below. num_labels=2 is an assumption;
# set it to the number of classes in your data.
model = AutoModelForSequenceClassification.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT", num_labels=2)

# Tokenize the clinical text data using the matching AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
# Wrap the tokenized clinical data in a PyTorch Dataset
class ClinicalDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        # Pad every example to a fixed length so the collator below can stack
        # them; padding=True on a single example leaves variable-length tensors
        encoding = self.tokenizer(text, return_tensors="pt", padding="max_length",
                                  truncation=True, max_length=self.max_length)
        return {"input_ids": encoding["input_ids"].squeeze(0),
                "attention_mask": encoding["attention_mask"].squeeze(0),
                "labels": torch.tensor(label)}
train_dataset = ClinicalDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
val_dataset = ClinicalDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)
# Fine-tune the pre-trained model on your clinical dataset
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Stack the fixed-length per-example tensors into batch tensors
    data_collator=lambda data: {
        'input_ids': torch.stack([f['input_ids'] for f in data]),
        'attention_mask': torch.stack([f['attention_mask'] for f in data]),
        'labels': torch.stack([f['labels'] for f in data]),
    },
)
trainer.train()
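
# Minimal Streamlit front end, a sketch only: the file imports streamlit but
# the intended UI is not shown, so the widgets and labels here are assumptions.
st.title("SurgerySort")
user_text = st.text_area("Paste a clinical note:")
if user_text:
    inputs = tokenizer(user_text, return_tensors="pt", truncation=True, max_length=128)
    with torch.no_grad():
        logits = model(**inputs).logits
    st.write(f"Predicted class: {logits.argmax(dim=-1).item()}")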