AshiniR commited on
Commit
c15b555
·
verified ·
1 Parent(s): 31ac22b

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +66 -0
README.md CHANGED
@@ -136,6 +136,9 @@ Optimized with **Optuna (15 trials)** across ranges:
136
  ## Usage
137
 
138
  ```python
 
 
 
139
  from transformers import RobertaTokenizer, RobertaForSequenceClassification
140
  import torch
141
 
@@ -144,9 +147,72 @@ model = RobertaForSequenceClassification.from_pretrained("AshiniR/hate-speech-an
144
  tokenizer = RobertaTokenizer.from_pretrained("AshiniR/hate-speech-and-offensive-message-classifier")
145
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
146
  model.to(device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
 
148
  def get_inference(text: str) -> list:
149
  """Returns prediction results in [{'label': str, 'score': float}, ...] format."""
 
 
 
150
  # Tokenize input text
151
  inputs = tokenizer(
152
  text,
 
136
  ## Usage
137
 
138
  ```python
139
+ import re
140
+ import html
141
+ import contractions
142
  from transformers import RobertaTokenizer, RobertaForSequenceClassification
143
  import torch
144
 
 
147
  tokenizer = RobertaTokenizer.from_pretrained("AshiniR/hate-speech-and-offensive-message-classifier")
148
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
149
  model.to(device)
150
+ model.eval()
151
+
152
+ def preprocess_text(text: str) -> str:
153
+ """
154
+ Preprocess raw text for transformer-based models like RoBERTa.
155
+
156
+ This function is tailored for toxicity, sentiment, and social media classification.
157
+ It removes noise (URLs, mentions, HTML codes) but keeps important signals
158
+ such as casing, punctuation, and emojis.
159
+
160
+ Steps:
161
+ 1. Decode HTML entities (e.g., '&gt;' → '>')
162
+ 2. Remove URLs (replaced with an empty string)
163
+ 3. Remove user mentions (replaced with an empty string)
164
+ 4. Remove '#' from hashtags but keep the word (e.g., "#love" → "love")
165
+ 5. Expand contractions (e.g., "you're" → "you are")
166
+ 6. Mildly normalize repeated characters (3+ → 2)
167
+ 7. Remove "RT" only if at start of tweet
168
+ 8. Normalize whitespace
169
+
170
+ Args:
171
+ text (str): Raw tweet text.
172
+
173
+ Returns:
174
+ str: Cleaned text suitable for RoBERTa tokenization.
175
+ """
176
+ if not isinstance(text, str):
177
+ return ""
178
+
179
+ # 1. Decode HTML entities
180
+ text = html.unescape(text)
181
+
182
+ # 2. Remove URLs (replace with empty string)
183
+ text = re.sub(r"(https?://\S+|www\.\S+)", "", text)
184
+
185
+ # 3. Remove user mentions (replace with empty string)
186
+ text = re.sub(r"@\w+", "", text)
187
+
188
+ # 4. Simplify hashtags
189
+ text = re.sub(r"#(\w+)", r"\1", text)
190
+
191
+ # 5. Expand contractions
192
+ text = contractions.fix(text)
193
+
194
+ # 6. Mild normalization of character elongations (3+ → 2)
195
+ text = re.sub(r"(.)\1{2,}", r"\1\1", text)
196
+
197
+ # 7. Remove leading "RT" (retweet marker) only when it starts the text
198
+ text = re.sub(
199
+ r"^[\s\W]*rt\s*@?\w*:?[\s-]*",
200
+ "",
201
+ text,
202
+ flags=re.IGNORECASE
203
+ )
204
+
205
+ # 8. Normalize whitespace
206
+ text = re.sub(r"\s+", " ", text).strip()
207
+
208
+ return text
209
+
210
 
211
  def get_inference(text: str) -> list:
212
  """Returns prediction results in [{'label': str, 'score': float}, ...] format."""
213
+ # Preprocess the text
214
+ text = preprocess_text(text)
215
+
216
  # Tokenize input text
217
  inputs = tokenizer(
218
  text,