Upload 10 files

Files changed:

- .gitattributes (+1, -0)
- README.md (+97, -3)
- classifier.pt (+3, -0)
- classifier_config.json (+6, -0)
- config.json (+33, -0)
- how_to_load.py (+44, -0)
- model.safetensors (+3, -0)
- sentencepiece.bpe.model (+3, -0)
- special_tokens_map.json (+51, -0)
- tokenizer.json (+3, -0)
- tokenizer_config.json (+55, -0)
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,97 @@
----
-
-
---
language: bn
tags:
- hate-speech-detection
- bangla
- bert
- binary-classification
license: mit
---

# Bangla Hate Speech Detection Model

This model is fine-tuned for binary hate speech detection in Bangla text.

## Model Description

- **Base Model**: AnnNaserNabil/BaseModelXLMRoberta
- **Task**: Binary classification (hate speech vs. non-hate speech)
- **Language**: Bangla (Bengali)
- **Training Method**: Baseline fine-tuning → knowledge distillation (loss sketched below)
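The card names the method but not the objective. As a minimal sketch of a common distillation loss for a single-logit sigmoid head like this one (the temperature `T` and mixing weight `alpha` are assumptions, not values from this repo):

```python
import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, labels, T=2.0, alpha=0.5):
    """Blend hard-label BCE with a soft-label term from the teacher."""
    # Hard targets: the 0/1 hate / non-hate labels (as float tensors).
    hard = F.binary_cross_entropy_with_logits(student_logits, labels)
    # Soft targets: the teacher's tempered probabilities.
    soft = F.binary_cross_entropy_with_logits(
        student_logits / T, torch.sigmoid(teacher_logits / T)
    )
    return alpha * hard + (1.0 - alpha) * soft
```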
## Training Details

### Training Hyperparameters

- **Batch Size**: 32
- **Learning Rate**: 2e-05
- **Epochs**: 15
- **Max Sequence Length**: 128
- **Dropout**: 0.1
- **Weight Decay**: 0.01
- **Warmup Ratio**: 0.1

### Training Data

- **K-Fold Cross-Validation**: 5 folds
- **Stratification**: binary (each fold preserves the hate / non-hate label ratio; see the sketch below)
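A minimal sketch of the stratified 5-fold split described above, using scikit-learn (the shuffle and seed settings are assumptions):

```python
from sklearn.model_selection import StratifiedKFold

texts = ["..."] * 10      # placeholder sentences
labels = [0, 1] * 5       # binary hate / non-hate labels

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, val_idx) in enumerate(skf.split(texts, labels)):
    # Each fold keeps the 0/1 label ratio of the full dataset.
    print(f"fold {fold}: {len(train_idx)} train / {len(val_idx)} val")
```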
## Performance

*Add your metrics here after training.*

## Usage

```python
import json

import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer

# Load the encoder and tokenizer
encoder = AutoModel.from_pretrained("path/to/model")
tokenizer = AutoTokenizer.from_pretrained("path/to/model")

# Rebuild the classifier head from its config
with open("path/to/model/classifier_config.json", "r") as f:
    c_config = json.load(f)

classifier = nn.Sequential(
    nn.Linear(c_config["hidden_size"], 256),
    nn.ReLU(),
    nn.Dropout(0.1),
    nn.Linear(256, c_config["num_labels"]),
)
classifier.load_state_dict(
    torch.load("path/to/model/classifier.pt", map_location="cpu")
)

# Disable dropout for deterministic inference
encoder.eval()
classifier.eval()

# Predict
def predict(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    with torch.no_grad():
        outputs = encoder(**inputs)
        # First-token ([CLS]) embedding as the sentence representation
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        logits = classifier(cls_embedding)
        prob = torch.sigmoid(logits).item()
    return prob

text = "আপনার বাংলা টেক্সট এখানে"
prob = predict(text)
print(f"Hate Speech Probability: {prob:.4f}")
```
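For scoring many sentences at once, the snippet above extends naturally to batches. A sketch reusing `encoder`, `classifier`, and `tokenizer` from the block above; `padding=True` is the only new assumption:

```python
def predict_batch(texts):
    # Tokenize the whole list in one call; padding equalizes lengths.
    inputs = tokenizer(
        texts, return_tensors="pt", truncation=True, padding=True, max_length=128
    )
    with torch.no_grad():
        hidden = encoder(**inputs).last_hidden_state[:, 0, :]
        probs = torch.sigmoid(classifier(hidden)).squeeze(-1)
    return probs.tolist()
```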
## Citation

If you use this model, please cite:

```bibtex
@misc{bangla-hate-speech-model,
  author = {Nabil},
  title = {Bangla Hate Speech Detection Model},
  year = {2026},
  publisher = {HuggingFace},
}
```

## License

MIT License
classifier.pt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bce6361b56c1a55183d6e2cfca921ce23ba4d889c8dacaf1cfc352114fc7264b
size 1053179
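classifier.pt (like model.safetensors, sentencepiece.bpe.model, and tokenizer.json below) is stored as a Git LFS pointer; a plain clone without LFS gives you only this stub. A sketch that fetches the resolved files through `huggingface_hub` (the repo id shown is a hypothetical placeholder, not this repo's real id):

```python
from huggingface_hub import snapshot_download

# Download every file in the repo, resolving LFS pointers to real blobs.
# "user/bangla-hate-speech" is a hypothetical repo id.
local_dir = snapshot_download(repo_id="user/bangla-hate-speech")
print(local_dir)  # path to the local snapshot
```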
classifier_config.json ADDED
@@ -0,0 +1,6 @@
{
  "type": "sequential",
  "layers": "Sequential(\n  (0): Linear(in_features=1024, out_features=256, bias=True)\n  (1): ReLU()\n  (2): Dropout(p=0.1, inplace=False)\n  (3): Linear(in_features=256, out_features=1, bias=True)\n)",
  "num_labels": 1,
  "hidden_size": 1024
}
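Note that `layers` is only the `repr()` string of the head, so the module cannot be deserialized from it automatically; the architecture has to be rebuilt in code, as `how_to_load.py` below does. A quick sanity check, assuming the repo files sit in the current directory:

```python
import json

import torch
import torch.nn as nn

with open("classifier_config.json") as f:
    cfg = json.load(f)

# Mirror the structure described by the "layers" repr string.
head = nn.Sequential(
    nn.Linear(cfg["hidden_size"], 256),   # 1024 -> 256
    nn.ReLU(),
    nn.Dropout(0.1),
    nn.Linear(256, cfg["num_labels"]),    # 256 -> 1
)

# load_state_dict raises if any key or shape disagrees with classifier.pt.
head.load_state_dict(torch.load("classifier.pt", map_location="cpu"))
print("classifier head matches classifier.pt")
```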
config.json ADDED
@@ -0,0 +1,33 @@
{
  "architectures": [
    "AlbertModel"
  ],
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "dtype": "float32",
  "embedding_size": 128,
  "eos_token_id": 3,
  "gap_size": 0,
  "hidden_act": "gelu_new",
  "hidden_dropout_prob": 0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "albert",
  "net_structure_type": 0,
  "num_attention_heads": 16,
  "num_hidden_groups": 1,
  "num_hidden_layers": 24,
  "num_memory_blocks": 0,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "tokenizer_class": "PreTrainedTokenizerFast",
  "transformers_version": "4.57.1",
  "type_vocab_size": 2,
  "vocab_size": 32000
}
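The encoder is an ALBERT model (`model_type: albert`, 24 layers, `hidden_size` 1024), even though the card's base-model name references XLM-RoBERTa; the `hidden_size` here is what must match the classifier head's input width. A quick consistency check, assuming the repo root is the working directory:

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained(".")
assert cfg.model_type == "albert"
# The head's first Linear layer expects this width (see classifier_config.json).
assert cfg.hidden_size == 1024
```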
how_to_load.py ADDED
@@ -0,0 +1,44 @@
# How to load this model:

import json

import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer

# Load encoder
encoder = AutoModel.from_pretrained("./outputs/final_student")

# Load classifier config
with open("./outputs/final_student/classifier_config.json", 'r') as f:
    c_config = json.load(f)

num_labels = c_config.get('num_labels', 1)
hidden_size = c_config.get('hidden_size', 768)

# Reconstruct classifier (the config stores only a repr string, not the module)
classifier = nn.Sequential(
    nn.Linear(hidden_size, 256),
    nn.ReLU(),
    nn.Dropout(0.1),
    nn.Linear(256, num_labels)
)
classifier.load_state_dict(
    torch.load("./outputs/final_student/classifier.pt", map_location="cpu")
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("./outputs/final_student")

# Disable dropout for deterministic inference
encoder.eval()
classifier.eval()

# Inference function
def predict(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    with torch.no_grad():
        outputs = encoder(**inputs)
        # First-token ([CLS]) embedding as the sentence representation
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        logits = classifier(cls_embedding)
        probs = torch.sigmoid(logits)
    return probs.item()

# Example
text = "আপনার বাংলা টেক্সট এখানে"
prob = predict(text)
print(f"Hate Speech Probability: {prob:.4f}")
print(f"Prediction: {'Hate Speech' if prob > 0.5 else 'Non-Hate Speech'}")
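The script keeps everything on CPU. A small optional extension for GPU inference, reusing `encoder`, `classifier`, and `tokenizer` from the script above (the function name is ours, not part of the repo):

```python
import torch

# Move both modules to the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder.to(device)
classifier.to(device)

def predict_on_device(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        hidden = encoder(**inputs).last_hidden_state[:, 0, :]
        return torch.sigmoid(classifier(hidden)).item()
```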
model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f88bb3012ef176c2bd4bbeecf1f2658fb50fdf88f97e976057113b328cf215b5
size 71762976
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
size 5069051
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
{
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "cls_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "mask_token": {
    "content": "<mask>",
    "lstrip": true,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<pad>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "sep_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3a56def25aa40facc030ea8b0b87f3688e4b3c39eb8b45d5702b3a1300fe2a20
size 17082734
tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "<s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<pad>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "</s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "250001": {
      "content": "<mask>",
      "lstrip": true,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": false,
  "cls_token": "<s>",
  "eos_token": "</s>",
  "extra_special_tokens": {},
  "mask_token": "<mask>",
  "model_max_length": 512,
  "pad_token": "<pad>",
  "sep_token": "</s>",
  "tokenizer_class": "XLMRobertaTokenizer",
  "unk_token": "<unk>"
}
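The tokenizer is declared as `XLMRobertaTokenizer` with `<mask>` mapped to id 250001, while the encoder's config.json reports a 32000-entry vocabulary, so the pairing is worth verifying locally. A quick check of the ids declared in `added_tokens_decoder`, assuming the repo root is the working directory:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")

# These ids mirror added_tokens_decoder in tokenizer_config.json.
assert tok.convert_tokens_to_ids("<s>") == 0
assert tok.convert_tokens_to_ids("<pad>") == 1
assert tok.convert_tokens_to_ids("</s>") == 2
assert tok.convert_tokens_to_ids("<unk>") == 3
assert tok.convert_tokens_to_ids("<mask>") == 250001
```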