# Fine-tune bert-base-uncased on AG News, then serve a Gradio demo that shows
# predictions together with word-importance highlights.

import random

import gradio as gr
import numpy as np
import torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    TextClassificationPipeline,
    Trainer,
    TrainingArguments,
    pipeline,
)
from transformers_interpret import SequenceClassificationExplainer

# Set random seeds for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Report the compute device (the Trainer selects its device automatically)
USE_MPS = torch.backends.mps.is_available()
device = torch.device("mps" if USE_MPS else "cpu")
print("Using device:", device)

# Load the ag_news dataset
raw = load_dataset("SetFit/ag_news")
print(raw)

# Load the BERT tokenizer
MODEL_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokenization function: truncate to 128 tokens; padding happens per batch later
def tokenize_fn(examples):
    return tokenizer(examples["text"], truncation=True, max_length=128)

# Drop every original column except "label" (SetFit/ag_news also has "label_text")
cols_to_remove = [c for c in raw["train"].column_names if c not in ("label",)]

# Apply tokenization to the dataset
tokenized = raw.map(tokenize_fn, batched=True, remove_columns=cols_to_remove)

# Remove the original text column, if still present, to avoid issues during batching
if "text" in tokenized["train"].column_names:
    tokenized = tokenized.remove_columns(["text"])

# Set dataset format to PyTorch tensors
tokenized.set_format("torch")

# Shuffle and split the training dataset to create a 5,000-example validation set
train_dataset = tokenized["train"].shuffle(seed=SEED)
val_split = train_dataset.train_test_split(test_size=5000, seed=SEED)
train_dataset = val_split["train"]
eval_dataset = val_split["test"]
print(train_dataset)

# Load pre-trained BERT with a fresh 4-way classification head
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=4)

# Create a data collator that dynamically pads input sequences in each batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define a metrics computation function using scikit-learn
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # Convert logits to predicted class indices
    preds = np.argmax(logits, axis=-1)
    # Compute accuracy and macro-averaged F1 using scikit-learn
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="macro")
    return {"accuracy": acc, "f1_macro": f1}

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # report_to=[],  # <- disable all integrations (no wandb, no tensorboard)
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.1,
    warmup_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    save_total_limit=3,
    fp16=torch.cuda.is_available(),
    dataloader_drop_last=False,
    gradient_accumulation_steps=1,
    seed=SEED,
)

# Create Trainer instance with early stopping (patience of 2 evaluations)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# Start model training
trainer.train()

# Save the fine-tuned model and tokenizer
trainer.save_model("my-fine-tuned-bert")
tokenizer.save_pretrained("my-fine-tuned-bert")
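# Sanity check (a sketch, not part of the original flow): score the best checkpoint
# on the held-out SetFit/ag_news test split, reusing compute_metrics from above.
# Assumes `tokenized["test"]` was produced by the map/set_format calls earlier.
test_metrics = trainer.evaluate(eval_dataset=tokenized["test"])
print("Test metrics:", test_metrics)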
# Load the fine-tuned model and tokenizer back from disk
new_model = AutoModelForSequenceClassification.from_pretrained("my-fine-tuned-bert")
new_tokenizer = AutoTokenizer.from_pretrained("my-fine-tuned-bert")

# Create a text classification pipeline
classifier = TextClassificationPipeline(
    model=new_model,
    tokenizer=new_tokenizer,
)

# Define the label mapping for the four AG News topics
label_mapping = {
    0: "World",
    1: "Sports",
    2: "Business",
    3: "Sci/Tech",
}

# Test the classifier on a sample sentence
sample_text = "This movie was good"
result = classifier(sample_text)

# Map the predicted "LABEL_<i>" string to a meaningful topic name
mapped_result = {
    "label": label_mapping[int(result[0]["label"].split("_")[1])],
    "score": result[0]["score"],
}
print(mapped_result)

# Reload once more under the names used by the demo below
MODEL_ID = "my-fine-tuned-bert"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)

explainer = SequenceClassificationExplainer(model=model, tokenizer=tokenizer)
label_names = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}

device = 0 if torch.cuda.is_available() else -1
clf = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    top_k=None,  # return scores for all classes
    device=device,
)

def predict(text: str):
    text = (text or "").strip()
    if not text:
        return {}
    out = clf(text, truncation=True)
    # With top_k=None a single input may come back as a nested list
    if isinstance(out, list) and isinstance(out[0], list):
        out = out[0]
    results = {}
    for o in sorted(out, key=lambda x: -x["score"]):
        idx = int(o["label"].split("_")[1])
        results[label_names[idx]] = float(o["score"])
    return results

# Build script-free HTML (inline styles only) so it renders in Gradio pages
def explain_html(text: str) -> str:
    text = (text or "").strip()
    if not text:
        return "Enter text to see highlighted words."
    atts = explainer(text)  # list of (token, attribution) pairs
    toks = [t for t, _ in atts]
    # Min-max normalize attribution magnitudes to [0, 1] for use as opacities
    scores = np.abs([s for _, s in atts])
    smin, smax = float(np.min(scores)), float(np.max(scores))
    scores = (scores - smin) / (smax - smin + 1e-8)
    # Highlight strength = normalized attribution, used as background opacity
    spans = [
        f"<span style='background-color: rgba(255, 165, 0, {s:.2f}); "
        f"border-radius: 4px; padding: 1px 3px;'>{tok}</span>"
        for tok, s in zip(toks, scores)
    ]
    return "<div style='line-height: 2.0;'>" + " ".join(spans) + "</div>"

def predict_and_explain(text: str):
    return predict(text), explain_html(text)

demo = gr.Interface(
    fn=predict_and_explain,
    inputs=gr.Textbox(lines=3, label="Enter news headline"),
    outputs=[
        gr.Label(num_top_classes=4, label="Predicted topic"),
        gr.HTML(label="Important-word highlights"),
    ],
    title="AG News Topic Classifier (BERT-base)",
    description="Shows the predicted topic and highlights words that influenced the decision.",
)

if __name__ == "__main__":
    demo.launch(share=True)