ENTUM-AI committed
Commit a1459dc · verified · Parent: a7f0341

Upload RoBERTa Clickbait Classifier

README.md ADDED
@@ -0,0 +1,84 @@
---
language:
- en
license: apache-2.0
library_name: transformers
tags:
- text-classification
- roberta
- clickbait
- clickbait-detection
- moderation
- content-moderation
datasets:
- christinacdl/Clickbait_New
- marksverdhei/clickbait_title_classification
- contemmcm/clickbait
metrics:
- accuracy
- f1
- precision
- recall
pipeline_tag: text-classification
---

# 🎯 RoBERTa Clickbait Classifier

A clickbait detection model built on **RoBERTa-base** (125M parameters), fine-tuned on multiple combined and deduplicated English datasets.

## 🚀 Quick Start

```python
from transformers import pipeline

classifier = pipeline("text-classification", model="ENTUM-AI/roberta-clickbait-classifier")

# Clickbait
result = classifier("You Won't BELIEVE What This Celebrity Did Next!")
print(result)  # [{'label': 'Clickbait', 'score': 0.99...}]

# Non-Clickbait
result = classifier("Federal Reserve raises interest rates by 0.25 percentage points")
print(result)  # [{'label': 'Non-Clickbait', 'score': 0.99...}]
```

## Model Details

| | |
|---|---|
| **Architecture** | RoBERTa-base (125M parameters) |
| **Task** | Binary text classification |
| **Labels** | `Clickbait` (1), `Non-Clickbait` (0) |
| **Language** | English |
| **License** | Apache 2.0 |
| **Max input length** | 128 tokens |
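The label mapping above mirrors the `id2label` entry in the model's `config.json`. As a minimal sketch of how the pipeline output is derived from the classifier head (the logits below are made-up illustrative values, not real model output), the two raw logits are softmaxed and the argmax index is mapped through `id2label`:

```python
import math

# Label mapping taken from the model's config.json
id2label = {0: "Non-Clickbait", 1: "Clickbait"}

def logits_to_prediction(logits):
    """Softmax the two logits and map the argmax index to its label."""
    exps = [math.exp(x) for x in logits]
    total = sum(exps)
    probs = [e / total for e in exps]
    idx = max(range(len(probs)), key=probs.__getitem__)
    return {"label": id2label[idx], "score": probs[idx]}

# Hypothetical logits for a strongly clickbait-looking headline
print(logits_to_prediction([-2.1, 3.4]))
```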

## 📊 Training Data

Three public English clickbait datasets, combined and deduplicated:

| Dataset | Size / Source |
|---------|---------------|
| [christinacdl/Clickbait_New](https://huggingface.co/datasets/christinacdl/Clickbait_New) | 58.6K samples from multiple sources |
| [marksverdhei/clickbait_title_classification](https://huggingface.co/datasets/marksverdhei/clickbait_title_classification) | 32K samples (Chakraborty et al., ASONAM 2016) |
| [contemmcm/clickbait](https://huggingface.co/datasets/contemmcm/clickbait) | 26K samples |

After deduplication and balancing: **~48K samples** (train/val/test split 85/10/5).
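The exact merge script is not published with the model, but the dedup-and-balance step described above can be sketched in plain Python. The sample pool below is hypothetical toy data, and the normalization (lowercase, collapsed whitespace) is an assumption about how near-duplicates were keyed:

```python
import random

# Hypothetical combined pool of (text, label) pairs from the three source
# datasets; this is a sketch of the described pipeline, not the original code.
pool = [
    ("You Won't BELIEVE What Happened", 1),
    ("you won't   believe what happened", 1),  # near-duplicate
    ("Fed raises rates by 0.25 points", 0),
    ("Senate passes budget bill", 0),
    ("10 Tricks Doctors Hate", 1),
]

def dedup_and_balance(samples, seed=42):
    # Deduplicate on a normalized form of the text.
    seen, unique = set(), []
    for text, label in samples:
        key = " ".join(text.lower().split())
        if key not in seen:
            seen.add(key)
            unique.append((text, label))
    # Downsample the majority class so both labels are equally represented.
    pos = [s for s in unique if s[1] == 1]
    neg = [s for s in unique if s[1] == 0]
    n = min(len(pos), len(neg))
    rng = random.Random(seed)
    balanced = rng.sample(pos, n) + rng.sample(neg, n)
    rng.shuffle(balanced)
    return balanced

data = dedup_and_balance(pool)
```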

## ⚙️ Training

Fine-tuned with the Hugging Face `Trainer` using the AdamW optimizer, a linear learning-rate schedule with warmup, and early stopping on validation F1.
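The hyperparameters recorded in `training_results.json` can be expressed as a `TrainingArguments` configuration. This is a reconstruction from the logged values, not the original training script; the output directory name is a placeholder:

```python
from transformers import TrainingArguments

# Reconstructed from training_results.json; "./roberta-clickbait" is a
# placeholder output path, not the original one.
args = TrainingArguments(
    output_dir="./roberta-clickbait",
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,   # effective batch size 64
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    num_train_epochs=5,
    seed=42,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",      # early stopping (patience 2) monitors F1
)
```

Early stopping itself would be attached via an `EarlyStoppingCallback(early_stopping_patience=2)` passed to the `Trainer`.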

## 💡 Use Cases

- **News aggregators** — filter low-quality clickbait articles
- **Social media** — content moderation and feed quality scoring
- **Browser extensions** — warn users about clickbait headlines
- **Email filters** — detect clickbait-style subject lines
- **Content platforms** — automated content quality assessment

## ⚠️ Limitations

- English only
- Optimized for short texts (headlines, titles, tweets); longer texts will be truncated to 128 tokens
- Reflects patterns and biases present in the training data sources
config.json ADDED
@@ -0,0 +1,37 @@
```json
{
  "add_cross_attention": false,
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "dtype": "float32",
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "Non-Clickbait",
    "1": "Clickbait"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "Clickbait": 1,
    "Non-Clickbait": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "problem_type": "single_label_classification",
  "tie_word_embeddings": true,
  "transformers_version": "5.1.0",
  "type_vocab_size": 1,
  "use_cache": false,
  "vocab_size": 50265
}
```
model.safetensors ADDED
@@ -0,0 +1,3 @@
```
version https://git-lfs.github.com/spec/v1
oid sha256:217e1e1259a57f18f9e5558f0a064550c55aac544a622e4990660b6d1f6bf91f
size 498612800
```
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,16 @@
```json
{
  "add_prefix_space": false,
  "backend": "tokenizers",
  "bos_token": "<s>",
  "cls_token": "<s>",
  "eos_token": "</s>",
  "errors": "replace",
  "is_local": false,
  "mask_token": "<mask>",
  "model_max_length": 512,
  "pad_token": "<pad>",
  "sep_token": "</s>",
  "tokenizer_class": "RobertaTokenizer",
  "trim_offsets": true,
  "unk_token": "<unk>"
}
```
training_results.json ADDED
@@ -0,0 +1,83 @@
```json
{
  "model_name": "roberta-base",
  "training_config": {
    "max_length": 128,
    "batch_size": 16,
    "grad_accum_steps": 4,
    "effective_batch_size": 64,
    "learning_rate": 2e-05,
    "weight_decay": 0.01,
    "warmup_ratio": 0.1,
    "label_smoothing": 0.0,
    "epochs_trained": 5,
    "max_epochs": 5,
    "early_stopping_patience": 2,
    "seed": 42
  },
  "test_metrics": {
    "loss": 0.1989,
    "accuracy": 0.9215,
    "f1": 0.9197,
    "precision": 0.9431,
    "recall": 0.8975
  },
  "training_log": [
    {
      "epoch": 1.0,
      "eval_loss": 0.21930797398090363,
      "eval_accuracy": 0.9154668860551214,
      "eval_f1": 0.9150650960942344,
      "eval_precision": 0.9275240888144114,
      "eval_recall": 0.9029363784665579
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.21582643687725067,
      "eval_accuracy": 0.9164952694364459,
      "eval_f1": 0.9156626506024096,
      "eval_precision": 0.9331075359864521,
      "eval_recall": 0.8988580750407831
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.22042229771614075,
      "eval_accuracy": 0.9127930892636775,
      "eval_f1": 0.9140308191403081,
      "eval_precision": 0.9088709677419354,
      "eval_recall": 0.9192495921696574
    },
    {
      "epoch": 4.0,
      "eval_loss": 0.2514384686946869,
      "eval_accuracy": 0.9127930892636775,
      "eval_f1": 0.9135752140236445,
      "eval_precision": 0.91320293398533,
      "eval_recall": 0.9139477977161501
    },
    {
      "epoch": 4.0,
      "eval_loss": 0.1989288628101349,
      "eval_accuracy": 0.9214638157894737,
      "eval_f1": 0.919714165615805,
      "eval_precision": 0.9431034482758621,
      "eval_recall": 0.8974569319114027
    }
  ],
  "confusion_matrix": [
    [
      1147,
      66
    ],
    [
      125,
      1094
    ]
  ],
  "training_time_minutes": 15.3,
  "timestamp": "2026-03-26T11:49:50.790207",
  "data_sizes": {
    "train": 41332,
    "validation": 4862,
    "test": 2432
  }
}
```
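The reported `test_metrics` can be re-derived from the confusion matrix in `training_results.json`. A small sketch, assuming rows are true labels and columns are predicted labels, both ordered `[Non-Clickbait, Clickbait]`:

```python
# Confusion matrix from training_results.json; the row/column convention
# (true labels x predicted labels, Non-Clickbait first) is an assumption.
cm = [[1147, 66],
      [125, 1094]]

tn, fp = cm[0]
fn, tp = cm[1]

accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * precision * recall / (precision + recall)

print(round(accuracy, 4), round(precision, 4), round(recall, 4), round(f1, 4))
# → 0.9215 0.9431 0.8975 0.9197
```

These match the reported `test_metrics` to four decimals, which supports that convention.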