# Fine-tune bert-base-uncased on the AG News topic dataset and serve a Gradio
# demo with prediction scores and word-importance highlights.
import random
import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    TextClassificationPipeline,
    pipeline,
)
from sklearn.metrics import accuracy_score, f1_score
from transformers_interpret import SequenceClassificationExplainer
import gradio as gr
# Set random seeds for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
# Prefer CUDA, then Apple MPS, then CPU (the fp16 flag below assumes CUDA)
USE_CUDA = torch.cuda.is_available()
USE_MPS = torch.backends.mps.is_available()
device = torch.device("cuda" if USE_CUDA else ("mps" if USE_MPS else "cpu"))
print("Using device:", device)
# Load the ag_news dataset
raw = load_dataset("SetFit/ag_news")
print(raw)
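# AG News has 4 topic classes (World, Sports, Business, Sci/Tech) with a
# 120k/7.6k train/test split; the SetFit mirror also ships a label_text column.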
# Load BERT tokenizer
MODEL_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Tokenization function
def tokenize_fn(examples):
    return tokenizer(examples["text"], truncation=True, max_length=128)
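# Padding is deferred to DataCollatorWithPadding below, so each batch is padded
# only to its own longest sequence rather than a fixed 128 tokens.

# Drop every original column except the label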
cols_to_remove = [c for c in raw["train"].column_names if c not in ("label",)]
# Apply tokenization to the dataset
tokenized = raw.map(tokenize_fn, batched=True, remove_columns=cols_to_remove)
# Remove original text column to avoid issues during batching
if "text" in tokenized["train"].column_names:
tokenized = tokenized.remove_columns(["text"])
# Set dataset format to PyTorch tensors
tokenized.set_format("torch")
# Shuffle and split the training dataset to create a validation set
train_dataset = tokenized["train"].shuffle(seed=SEED)
val_split = train_dataset.train_test_split(test_size=5000, seed=SEED)
train_dataset = val_split["train"]
eval_dataset = val_split["test"]
print(train_dataset)
# Load pre-trained BERT model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=4)
# Create a data collator that dynamically pads input sequences in each batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Define a metrics computation function using scikit-learn
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # Convert logits to predicted class indices
    preds = np.argmax(logits, axis=-1)
    # Compute accuracy and macro-averaged F1 using scikit-learn
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="macro")
    return {"accuracy": acc, "f1_macro": f1}
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # report_to=[],  # uncomment to disable all integrations (no wandb, no tensorboard)
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.1,
    warmup_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    save_total_limit=3,
    fp16=torch.cuda.is_available(),  # mixed precision only applies on CUDA
    dataloader_drop_last=False,
    gradient_accumulation_steps=1,
    seed=SEED,
)
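# Note: older transformers releases (roughly pre-4.41) call this argument
# `evaluation_strategy` instead of `eval_strategy`; adjust if you hit a TypeError.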
# Create Trainer instance with early stopping
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)
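# EarlyStoppingCallback halts training once `metric_for_best_model` (eval_loss)
# fails to improve for 2 consecutive evaluations; with eval_strategy="epoch",
# that means two epochs without improvement. It requires load_best_model_at_end=True.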
# Start model training
trainer.train()
# Save the fine-tuned model
trainer.save_model('my-fine-tuned-bert')
# Save the tokenizer
tokenizer.save_pretrained('my-fine-tuned-bert')
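# The directory now contains the config, weights, and tokenizer files, so it
# can be reloaded standalone with from_pretrained(), as done below.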
# Load the fine-tuned model and tokenizer
new_model = AutoModelForSequenceClassification.from_pretrained('my-fine-tuned-bert')
new_tokenizer = AutoTokenizer.from_pretrained('my-fine-tuned-bert')
# Create a text classification pipeline
classifier = TextClassificationPipeline(
    model=new_model,
    tokenizer=new_tokenizer,
)
# Define label mapping
label_mapping = {
    0: "World",
    1: "Sports",
    2: "Business",
    3: "Sci/Tech",
}
# Test the classifier on a sample sentence
sample_text = "This movie was good"
result = classifier(sample_text)
# Map the numeric label back to a human-readable topic; the pipeline reports
# labels like "LABEL_2" because no id2label mapping was saved with the model
mapped_result = {
    "label": label_mapping[int(result[0]["label"].split("_")[1])],
    "score": result[0]["score"],
}
print(mapped_result)
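# Optional (sketch, left commented out): persisting the names in the config makes
# pipelines return readable labels directly; the "LABEL_<i>" string parsing above
# and below would then need updating.
#   new_model.config.id2label = label_mapping
#   new_model.config.label2id = {v: k for k, v in label_mapping.items()}
#   new_model.save_pretrained("my-fine-tuned-bert")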
# Reload the fine-tuned model for the interpretability demo
MODEL_ID = "my-fine-tuned-bert"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
explainer = SequenceClassificationExplainer(model=model, tokenizer=tokenizer)
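# transformers-interpret computes attributions with Captum's integrated
# gradients, giving each token a signed contribution toward the predicted class.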
label_names = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
# pipeline() takes a device index (0 = first CUDA GPU, -1 = CPU); use a separate
# name to avoid shadowing the torch.device set earlier
pipe_device = 0 if torch.cuda.is_available() else -1
clf = pipeline("text-classification", model=model, tokenizer=tokenizer, top_k=None, device=pipe_device)
def predict(text: str):
    text = (text or "").strip()
    if not text:
        return {}
    out = clf(text, truncation=True)
    # With top_k=None the pipeline scores every class; some versions wrap the
    # result list in another list, so unwrap it
    if isinstance(out, list) and isinstance(out[0], list):
        out = out[0]
    results = {}
    for o in sorted(out, key=lambda x: -x["score"]):
        idx = int(o["label"].split("_")[1])
        results[label_names[idx]] = float(o["score"])
    return results
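# Illustrative call (scores are made up): predict("Stocks rally after strong
# earnings") -> {"Business": 0.97, "World": 0.02, ...}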
# Build script-free HTML so it renders in Gradio pages
def explain_html(text: str) -> str:
    text = (text or "").strip()
    if not text:
        return "<i>Enter text to see highlighted words.</i>"
    atts = explainer(text)
    toks = [t for t, _ in atts]
    # Use attribution magnitudes, min-max normalised to [0, 1] for opacity;
    # the epsilon guards against division by zero when all scores are equal
    scores = np.abs([s for _, s in atts])
    smin, smax = float(np.min(scores)), float(np.max(scores))
    scores = (scores - smin) / (smax - smin + 1e-8)
    spans = [
        f"<span style='background: rgba(255,0,0,{0.15 + 0.85 * s:.2f});"
        f" padding:2px 3px; margin:1px; border-radius:4px; display:inline-block'>{tok}</span>"
        for tok, s in zip(toks, scores)
    ]
    return "<div style='line-height:2'>" + " ".join(spans) + "</div>"
def predict_and_explain(text: str):
    return predict(text), explain_html(text)
demo = gr.Interface(
    fn=predict_and_explain,
    inputs=gr.Textbox(lines=3, label="Enter news headline"),
    outputs=[
        gr.Label(num_top_classes=4, label="Predicted topic"),
        gr.HTML(label="Important-word highlights"),
    ],
    title="AG News Topic Classifier (BERT-base)",
    description="Shows the predicted topic and highlights the words that influenced the decision.",
)
if __name__ == "__main__":
demo.launch(share=True)