AshiniR commited on
Commit
c15b555
·
verified ·
1 Parent(s): 31ac22b

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +66 -0
README.md CHANGED
@@ -136,6 +136,9 @@ Optimized with **Optuna (15 trials)** across ranges:
136
  ## Usage
137
 
138
  ```python
 
 
 
139
  from transformers import RobertaTokenizer, RobertaForSequenceClassification
140
  import torch
141
 
@@ -144,9 +147,72 @@ model = RobertaForSequenceClassification.from_pretrained("AshiniR/hate-speech-an
144
  tokenizer = RobertaTokenizer.from_pretrained("AshiniR/hate-speech-and-offensive-message-classifier")
145
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
146
  model.to(device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
 
148
  def get_inference(text: str) -> list:
149
  """Returns prediction results in [{'label': str, 'score': float}, ...] format."""
 
 
 
150
  # Tokenize input text
151
  inputs = tokenizer(
152
  text,
 
136
  ## Usage
137
 
138
  ```python
139
+ import re
140
+ import html
141
+ import contractions
142
  from transformers import RobertaTokenizer, RobertaForSequenceClassification
143
  import torch
144
 
 
147
  tokenizer = RobertaTokenizer.from_pretrained("AshiniR/hate-speech-and-offensive-message-classifier")
148
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
149
  model.to(device)
150
+ model.eval()
151
+
152
+ def preprocess_text(text: str) -> str:
153
+ """
154
+ Preprocess raw text for transformer-based models like RoBERTa.
155
+
156
+ This function is tailored for toxicity, sentiment, and social media classification.
157
+ It removes noise (URLs, mentions, HTML codes) but keeps important signals
158
+ such as casing, punctuation, and emojis.
159
+
160
+ Steps:
161
+ 1. Decode HTML entities (e.g., '&gt;' → '>')
162
+ 2. Remove URLs (replaced with an empty string)
163
+ 3. Remove user mentions (replaced with an empty string)
164
+ 4. Remove '#' from hashtags but keep the word (e.g., "#love" → "love")
165
+ 5. Expand contractions (e.g., "you're" → "you are")
166
+ 6. Mildly normalize repeated characters (3+ → 2)
167
+ 7. Remove "RT" only if at start of tweet
168
+ 8. Normalize whitespace
169
+
170
+ Args:
171
+ text (str): Raw tweet text.
172
+
173
+ Returns:
174
+ str: Cleaned text suitable for RoBERTa tokenization.
175
+ """
176
+ if not isinstance(text, str):
177
+ return ""
178
+
179
+ # 1. Decode HTML entities
180
+ text = html.unescape(text)
181
+
182
+ # 2. Remove URLs (replace with empty string)
183
+ text = re.sub(r"(https?://\S+|www\.\S+)", "", text)
184
+
185
+ # 3. Remove user mentions (replace with empty string)
186
+ text = re.sub(r"@\w+", "", text)
187
+
188
+ # 4. Simplify hashtags
189
+ text = re.sub(r"#(\w+)", r"\1", text)
190
+
191
+ # 5. Expand contractions
192
+ text = contractions.fix(text)
193
+
194
+ # 6. Mild normalization of character elongations (3+ → 2)
195
+ text = re.sub(r"(.)\1{2,}", r"\1\1", text)
196
+
197
+ # 7. Remove leading "RT" (retweet marker) only when it starts the text
198
+ text = re.sub(
199
+ r"^[\s\W]*rt\s*@?\w*:?[\s-]*",
200
+ "",
201
+ text,
202
+ flags=re.IGNORECASE
203
+ )
204
+
205
+ # 8. Normalize whitespace
206
+ text = re.sub(r"\s+", " ", text).strip()
207
+
208
+ return text
209
+
210
 
211
  def get_inference(text: str) -> list:
212
  """Returns prediction results in [{'label': str, 'score': float}, ...] format."""
213
+ # Preprocess the text
214
+ text = preprocess_text(text)
215
+
216
  # Tokenize input text
217
  inputs = tokenizer(
218
  text,