Update README.md
Browse files
README.md
CHANGED
|
@@ -136,6 +136,9 @@ Optimized with **Optuna (15 trials)** across ranges:
|
|
| 136 |
## Usage
|
| 137 |
|
| 138 |
```python
|
|
|
|
|
|
|
|
|
|
| 139 |
from transformers import RobertaTokenizer, RobertaForSequenceClassification
|
| 140 |
import torch
|
| 141 |
|
|
@@ -144,9 +147,72 @@ model = RobertaForSequenceClassification.from_pretrained("AshiniR/hate-speech-an
|
|
| 144 |
tokenizer = RobertaTokenizer.from_pretrained("AshiniR/hate-speech-and-offensive-message-classifier")
|
| 145 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 146 |
model.to(device)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
|
| 148 |
def get_inference(text: str) -> list:
|
| 149 |
"""Returns prediction results in [{'label': str, 'score': float}, ...] format."""
|
|
|
|
|
|
|
|
|
|
| 150 |
# Tokenize input text
|
| 151 |
inputs = tokenizer(
|
| 152 |
text,
|
|
|
|
| 136 |
## Usage
|
| 137 |
|
| 138 |
```python
|
| 139 |
+
import re
|
| 140 |
+
import html
|
| 141 |
+
import contractions
|
| 142 |
from transformers import RobertaTokenizer, RobertaForSequenceClassification
|
| 143 |
import torch
|
| 144 |
|
|
|
|
| 147 |
tokenizer = RobertaTokenizer.from_pretrained("AshiniR/hate-speech-and-offensive-message-classifier")
|
| 148 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 149 |
model.to(device)
|
| 150 |
+
model.eval()
|
| 151 |
+
|
| 152 |
+
def preprocess_text(text: str) -> str:
    """
    Preprocess raw text for transformer-based models like RoBERTa.

    This function is tailored for toxicity, sentiment, and social media
    classification. It replaces noise (URLs, mentions, HTML codes) with
    neutral placeholders but keeps important signals such as casing,
    punctuation, and emojis.

    Steps:
        1. Decode HTML entities (e.g., '&gt;' -> '>')
        2. Replace URLs with the "<URL>" placeholder
        3. Replace user mentions with the "<USER>" placeholder
        4. Remove '#' from hashtags but keep the word (e.g., "#love" -> "love")
        5. Expand contractions (e.g., "you're" -> "you are")
        6. Mildly normalize repeated characters (3+ -> 2)
        7. Remove "RT" only if at start of tweet
        8. Normalize whitespace

    Args:
        text (str): Raw tweet text.

    Returns:
        str: Cleaned text suitable for RoBERTa tokenization.
    """
    # Non-string input (e.g., NaN coming out of a dataframe) -> empty string.
    if not isinstance(text, str):
        return ""

    # 1. Decode HTML entities ('&amp;' -> '&', '&gt;' -> '>', ...)
    text = html.unescape(text)

    # 2. Replace URLs with a placeholder token.
    # NOTE(review): the placeholder "<URL>" was stripped by the page's HTML
    # rendering (it looked like a tag); restored here to match the docstring.
    text = re.sub(r"(https?://\S+|www\.\S+)", "<URL>", text)

    # 3. Replace user mentions with a placeholder token ("<USER>" restored
    # for the same reason as above).
    text = re.sub(r"@\w+", "<USER>", text)

    # 4. Simplify hashtags: drop '#', keep the word itself.
    text = re.sub(r"#(\w+)", r"\1", text)

    # 5. Expand contractions (third-party `contractions` package).
    text = contractions.fix(text)

    # 6. Mild normalization of character elongations: 3+ repeats -> 2
    # ("soooo" -> "soo"), preserving some emphasis signal.
    text = re.sub(r"(.)\1{2,}", r"\1\1", text)

    # 7. Remove the retweet marker "RT" (plus optional mention and colon)
    # only when it starts the tweet.
    text = re.sub(
        r"^[\s\W]*rt\s*@?\w*:?[\s-]*",
        "",
        text,
        flags=re.IGNORECASE,
    )

    # 8. Collapse runs of whitespace and trim the ends.
    text = re.sub(r"\s+", " ", text).strip()

    return text
|
| 209 |
+
|
| 210 |
|
| 211 |
def get_inference(text: str) -> list:
|
| 212 |
"""Returns prediction results in [{'label': str, 'score': float}, ...] format."""
|
| 213 |
+
# Preprocess the text
|
| 214 |
+
text = preprocess_text(text)
|
| 215 |
+
|
| 216 |
# Tokenize input text
|
| 217 |
inputs = tokenizer(
|
| 218 |
text,
|