# NOTE: removed Hugging Face Spaces page chrome ("Spaces:" header and
# "Runtime error" status lines) captured alongside this app script.
import random

import numpy as np
import torch
import gradio as gr
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    TextClassificationPipeline,
    Trainer,
    TrainingArguments,
    pipeline,
)
from transformers_interpret import SequenceClassificationExplainer

# Reproducibility: seed every RNG the training stack touches.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Prefer the Apple-silicon GPU (MPS) when present, otherwise fall back to CPU.
USE_MPS = torch.backends.mps.is_available()
device = torch.device("mps" if USE_MPS else "cpu")
print("Using device:", device)

# AG News: 4-class news-topic classification dataset.
raw = load_dataset("SetFit/ag_news")
print(raw)

# Tokenizer matching the checkpoint we fine-tune below.
MODEL_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
def tokenize_fn(examples):
    """Tokenize a batch of raw examples, truncating to 128 wordpieces."""
    return tokenizer(examples["text"], truncation=True, max_length=128)
# Keep only "label"; every other raw column ("text", "label_text", ...) is
# dropped by map(remove_columns=...) below, so the original follow-up
# `if "text" in ...: remove_columns(["text"])` pass was dead code — "text"
# can never survive the map — and has been removed.
cols_to_remove = [c for c in raw["train"].column_names if c != "label"]

# Apply tokenization to the dataset (batched for speed).
tokenized = raw.map(tokenize_fn, batched=True, remove_columns=cols_to_remove)

# Return PyTorch tensors from __getitem__ so the Trainer can batch directly.
tokenized.set_format("torch")

# Shuffle, then carve a held-out 5000-example validation split off the train set.
train_dataset = tokenized["train"].shuffle(seed=SEED)
val_split = train_dataset.train_test_split(test_size=5000, seed=SEED)
train_dataset = val_split["train"]
eval_dataset = val_split["test"]
print(train_dataset)

# Pre-trained BERT with a fresh 4-way classification head (AG News topics).
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=4)

# Dynamically pad each batch to its longest sequence (cheaper than global padding).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
def compute_metrics(eval_pred):
    """Compute accuracy and macro-F1 for a Trainer (logits, labels) tuple."""
    logits, labels = eval_pred
    # Highest-scoring class index per example.
    predictions = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1_macro": f1_score(labels, predictions, average="macro"),
    }
# Hyper-parameters and checkpointing policy for fine-tuning.
training_args = TrainingArguments(
    output_dir="./results",
    seed=SEED,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.1,
    warmup_steps=100,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    dataloader_drop_last=False,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # Evaluate per epoch and restore the lowest-eval-loss checkpoint at the end.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    save_total_limit=3,
    # Mixed precision only applies on CUDA; evaluates to False on MPS/CPU.
    fp16=torch.cuda.is_available(),
)
# Fine-tune with early stopping on the validation metric (eval_loss).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)
trainer.train()

# Persist weights and tokenizer side by side so the directory can be
# reloaded later with from_pretrained().
trainer.save_model("my-fine-tuned-bert")
tokenizer.save_pretrained("my-fine-tuned-bert")
# Reload the exported model/tokenizer exactly as a downstream consumer would.
new_model = AutoModelForSequenceClassification.from_pretrained("my-fine-tuned-bert")
new_tokenizer = AutoTokenizer.from_pretrained("my-fine-tuned-bert")

# Plain text-classification pipeline over the reloaded checkpoint.
classifier = TextClassificationPipeline(model=new_model, tokenizer=new_tokenizer)

# AG News class index -> human-readable topic name.
label_mapping = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}

# Smoke-test the pipeline on a sample sentence.
sample_text = "This movie was good"
prediction = classifier(sample_text)
top = prediction[0]
# Pipeline labels look like "LABEL_<idx>"; map the index to a topic name.
mapped_result = {
    "label": label_mapping[int(top["label"].split("_")[1])],
    "score": top["score"],
}
print(mapped_result)
MODEL_ID = "my-fine-tuned-bert"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)

# Word-attribution explainer used to highlight influential tokens.
explainer = SequenceClassificationExplainer(model=model, tokenizer=tokenizer)

# AG News class index -> topic name (mirrors label_mapping above).
label_names = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}

# BUG FIX: the original used `0 if cuda else -1`, which silently forced CPU
# inference on Apple silicon even though the script targets MPS for training.
# transformers pipelines accept a device string, so route to MPS when available.
if torch.cuda.is_available():
    device = 0
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = -1
clf = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    top_k=None,
    device=device,
)
def predict(text: str):
    """Return {topic_name: probability} for *text*, sorted by descending score.

    Returns an empty dict for blank/None input.
    """
    cleaned = (text or "").strip()
    if not cleaned:
        return {}
    raw_scores = clf(cleaned, truncation=True)
    # With top_k=None the pipeline wraps the per-class scores in an extra list.
    if isinstance(raw_scores, list) and isinstance(raw_scores[0], list):
        raw_scores = raw_scores[0]
    ranked = sorted(raw_scores, key=lambda item: item["score"], reverse=True)
    return {
        label_names[int(item["label"].split("_")[1])]: float(item["score"])
        for item in ranked
    }
# Build script-free HTML so it renders inside Gradio pages.
def explain_html(text: str) -> str:
    """Render *text* with per-token red highlights scaled by attribution magnitude."""
    cleaned = (text or "").strip()
    if not cleaned:
        return "<i>Enter text to see highlighted words.</i>"
    attributions = explainer(cleaned)
    tokens = [tok for tok, _ in attributions]
    magnitudes = np.abs([score for _, score in attributions])
    # Min-max normalise to [0, 1]; the epsilon guards the all-equal case.
    lo, hi = float(np.min(magnitudes)), float(np.max(magnitudes))
    weights = (magnitudes - lo) / (hi - lo + 1e-8)
    pieces = []
    for tok, w in zip(tokens, weights):
        pieces.append(
            f"<span style='background: rgba(255,0,0,{0.15+0.85*w:.2f});"
            f"padding:2px 3px; margin:1px; border-radius:4px; display:inline-block'>{tok}</span>"
        )
    return "<div style='line-height:2'>" + " ".join(pieces) + "</div>"
def predict_and_explain(text: str):
    """Combined Gradio handler: (score dict, highlight HTML) for one input."""
    return predict(text), explain_html(text)
# Gradio UI: one textbox in; predicted-topic label and highlighted HTML out.
demo = gr.Interface(
    fn=predict_and_explain,
    inputs=gr.Textbox(lines=3, label="Enter news headline"),
    outputs=[
        gr.Label(num_top_classes=4, label="Predicted topic"),
        gr.HTML(label="Important-word highlights"),
    ],
    title="AG News Topic Classifier (BERT-base)",
    description="Shows predicted topic and highlights words that influenced the decision.",
)

if __name__ == "__main__":
    demo.launch(share=True)