Upload 10 files

Browse files

Files changed (10) hide show

README.md +97 -3
classifier.pt +3 -0
classifier_config.json +6 -0
config.json +24 -0
how_to_load.py +44 -0
model.safetensors +3 -0
special_tokens_map.json +7 -0
tokenizer.json +0 -0
tokenizer_config.json +58 -0
vocab.txt +0 -0

README.md CHANGED Viewed

@@ -1,3 +1,97 @@
----
-license: mit
----

+---
+language: bn
+tags:
+- hate-speech-detection
+- bangla
+- bert
+- binary-classification
+license: mit
+---
+# Bangla Hate Speech Detection Model
+This model is fine-tuned for binary hate speech detection in Bangla text.
+## Model Description
+- **Base Model**: md-nishat-008/Mixed-Distil-BERT
+- **Task**: Binary Classification (Hate Speech vs Non-Hate Speech)
+- **Language**: Bangla (Bengali)
+- **Training Method**: Baseline training only (original behavior)
+## Training Details
+### Training Hyperparameters
+- **Batch Size**: 16
+- **Learning Rate**: 1e-05
+- **Epochs**: 10
+- **Max Sequence Length**: 128
+- **Dropout**: 0.1
+- **Weight Decay**: 0.01
+- **Warmup Ratio**: 0.1
+### Training Data
+- **K-Fold Cross-Validation**: 5 folds
+- **Stratification**: binary
+## Performance
+*Add your metrics here after training*
+## Usage
+```python
+from transformers import AutoModel, AutoTokenizer
+import torch
+import torch.nn as nn
+import json
+# Load model components
+encoder = AutoModel.from_pretrained("path/to/model")
+with open("path/to/model/classifier_config.json", 'r') as f:
+    c_config = json.load(f)
+classifier = nn.Sequential(
+    nn.Linear(c_config['hidden_size'], 256),
+    nn.ReLU(),
+    nn.Dropout(0.1),
+    nn.Linear(256, c_config['num_labels'])
+)
+classifier.load_state_dict(torch.load("path/to/model/classifier.pt"))
+tokenizer = AutoTokenizer.from_pretrained("path/to/model")
+# Predict
+def predict(text):
+    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
+    with torch.no_grad():
+        outputs = encoder(**inputs)
+        cls_embedding = outputs.last_hidden_state[:, 0, :]
+        logits = classifier(cls_embedding)
+        prob = torch.sigmoid(logits).item()
+    return prob
+text = "আপনার বাংলা টেক্সট এখানে"
+prob = predict(text)
+print(f"Hate Speech Probability: {prob:.4f}")
+```
+## Citation
+If you use this model, please cite:
+```bibtex
+@misc{bangla-hate-speech-model,
+  author = {Nabil},
+  title = {Bangla Hate Speech Detection Model},
+  year = {2026},
+  publisher = {HuggingFace},
+}
+```
+## License
+MIT License

classifier.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e5fd4d1a658194d203ce2a06dc79777d53f6f4dcf35d909bcb987857cbe6bc43
+size 790568

classifier_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "type": "sequential",
+  "layers": "Sequential(\n  (0): Linear(in_features=768, out_features=256, bias=True)\n  (1): ReLU()\n  (2): Dropout(p=0.1, inplace=False)\n  (3): Linear(in_features=256, out_features=1, bias=True)\n)",
+  "num_labels": 1,
+  "hidden_size": 768
+}

config.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "activation": "gelu",
+  "architectures": [
+    "DistilBertModel"
+  ],
+  "attention_dropout": 0.1,
+  "dim": 768,
+  "dropout": 0.1,
+  "hidden_dim": 3072,
+  "initializer_range": 0.02,
+  "max_position_embeddings": 512,
+  "model_type": "distilbert",
+  "n_heads": 12,
+  "n_layers": 6,
+  "output_hidden_states": true,
+  "pad_token_id": 0,
+  "qa_dropout": 0.1,
+  "seq_classif_dropout": 0.2,
+  "sinusoidal_pos_embds": false,
+  "tie_weights_": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.53.3",
+  "vocab_size": 30522
+}

how_to_load.py ADDED Viewed

	@@ -0,0 +1,44 @@

+# How to load this model:
+from transformers import AutoModel, AutoTokenizer
+import torch
+import torch.nn as nn
+import json
+# Load encoder
+encoder = AutoModel.from_pretrained("./outputs/final_baseline_best")
+# Load classifier config
+with open("./outputs/final_baseline_best/classifier_config.json", 'r') as f:
+    c_config = json.load(f)
+num_labels = c_config.get('num_labels', 1)
+hidden_size = c_config.get('hidden_size', 768)
+# Reconstruct classifier
+classifier = nn.Sequential(
+    nn.Linear(hidden_size, 256),
+    nn.ReLU(),
+    nn.Dropout(0.1),
+    nn.Linear(256, num_labels)
+)
+classifier.load_state_dict(torch.load("./outputs/final_baseline_best/classifier.pt"))
+# Load tokenizer
+tokenizer = AutoTokenizer.from_pretrained("./outputs/final_baseline_best")
+# Inference function
+def predict(text):
+    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
+    with torch.no_grad():
+        outputs = encoder(**inputs)
+        cls_embedding = outputs.last_hidden_state[:, 0, :]
+        logits = classifier(cls_embedding)
+        probs = torch.sigmoid(logits)
+    return probs.item()
+# Example
+text = "আপনার বাংলা টেক্সট এখানে"
+prob = predict(text)
+print(f"Hate Speech Probability: {prob:.4f}")
+print(f"Prediction: {'Hate Speech' if prob > 0.5 else 'Non-Hate Speech'}")

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2a617546ae639c094a9af5596f01afa78dec6a80b727adf1df7ce96279c65ad1
+size 265462608

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "DistilBertTokenizer",
+  "unk_token": "[UNK]"
+}

vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff