Lahari2005 committed on
Commit
561afe3
·
verified ·
1 Parent(s): f08a743

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +446 -0
README.md CHANGED
@@ -11,3 +11,449 @@ short_description: comment classification
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
14
+ import gradio as gr
15
+ import pandas as pd
16
+ import numpy as np
17
+ from sklearn.feature_extraction.text import TfidfVectorizer
18
+ from sklearn.linear_model import LogisticRegression
19
+ from sklearn.model_selection import train_test_split
20
+ import random
21
+ import re
22
+
23
+ # Create synthetic dataset for toxic and non-toxic comments
24
def create_synthetic_dataset(n_per_class=500, seed=42):
    """Build a balanced, shuffled synthetic corpus of toxic and civil comments.

    Every comment is made by picking a template at random and filling its
    placeholders from small word lists.  All draws come from a private
    ``random.Random(seed)`` instance, so the result is reproducible and the
    process-wide ``random`` / ``numpy.random`` generators are left untouched.

    Args:
        n_per_class: Number of comments generated for each of the two labels.
        seed: Seed for the private random generator.

    Returns:
        A ``pandas.DataFrame`` with ``2 * n_per_class`` rows and the columns
        ``comment`` (text) and ``toxic`` (1 = toxic, 0 = civil).
    """
    rng = random.Random(seed)

    # Toxic comment templates
    toxic_patterns = [
        "You're such a {insult} who knows nothing about {topic}.",
        "Only an {insult} would think that about {topic}.",
        "This is the dumbest take on {topic} I've ever seen.",
        "Go back to {place}, you {insult}.",
        "Why are you so {negative_adj} about everything?",
        "Everyone like you should be {threat}.",
        "Your opinion is worthless because you're a {insult}.",
        "I hope you {threat} for saying that.",
        "People like you are the reason why {bad_thing} happens.",
        "Shut up, you don't know what you're talking about.",
        "You're just a {insult} with no life.",
        "How can anyone be this {negative_adj}?",
        "I wouldn't expect anything better from a {insult}.",
        "Your existence is an insult to {group}.",
        "Do everyone a favor and {threat}.",
    ]

    # Civil comment templates
    non_toxic_patterns = [
        "I appreciate your perspective on {topic}.",
        "That's an interesting point about {topic}.",
        "I see what you mean, but have you considered {alternative_view}?",
        "Thanks for sharing your thoughts on {topic}.",
        "I respectfully disagree because of {reason}.",
        "That's a good question about {topic}.",
        "I learned something new about {topic} today.",
        "Could you elaborate more on your view about {topic}?",
        "I never thought about it that way before.",
        "You make a valid point regarding {topic}.",
        "I understand where you're coming from.",
        "Let's agree to disagree on this one.",
        "I value different opinions on {topic}.",
        "That's a fair assessment of the situation.",
        "I think we have common ground on {shared_view}.",
    ]

    topics = ["politics", "sports", "technology", "music", "movies", "science", "education", "health", "environment", "economy"]

    # Placeholder name -> candidate values.  Dict order fixes the order in
    # which random values are drawn for each generated comment.
    toxic_fillers = {
        "insult": ["idiot", "moron", "fool", "jerk", "imbecile", "buffoon", "dimwit", "simpleton", "dunce", "nitwit"],
        "topic": topics,
        "negative_adj": ["stupid", "ignorant", "pathetic", "ridiculous", "awful", "terrible", "horrible", "disgusting", "vile", "repulsive"],
        "place": ["your country", "where you came from", "your mom's basement", "the cave you live in", "under your rock"],
        "threat": ["die", "disappear", "stop talking", "leave", "get banned", "be quiet", "go away", "never return", "get lost", "vanish"],
        "bad_thing": ["war", "famine", "disease", "poverty", "conflict", "hate", "violence", "discrimination", "suffering", "chaos"],
        "group": ["humanity", "society", "this community", "intelligent people", "decent folks"],
    }
    non_toxic_fillers = {
        "topic": topics,
        "alternative_view": ["this other aspect", "the historical context", "the data", "recent developments", "expert opinions"],
        "reason": ["my experiences", "the evidence", "what I've read", "statistics", "expert analysis"],
        "shared_view": ["this issue", "the importance of dialogue", "seeking truth", "finding solutions", "moving forward"],
    }

    def generate(patterns, fillers, label):
        """Return ``n_per_class`` ``(comment, label)`` pairs from ``patterns``."""
        rows = []
        for _ in range(n_per_class):
            template = rng.choice(patterns)
            # One value is drawn for every placeholder, used or not, so each
            # comment consumes the random stream in the same fixed pattern.
            values = {name: rng.choice(options) for name, options in fillers.items()}
            rows.append((template.format(**values), label))
        return rows

    all_comments = generate(toxic_patterns, toxic_fillers, 1)
    all_comments += generate(non_toxic_patterns, non_toxic_fillers, 0)
    rng.shuffle(all_comments)

    return pd.DataFrame(all_comments, columns=['comment', 'toxic'])
112
+
113
+ # Create and train the model
114
def create_and_train_model(df):
    """Fit a TF-IDF + logistic-regression toxicity classifier on ``df``.

    Args:
        df: DataFrame with a ``comment`` text column and a ``toxic`` label
            column.

    Returns:
        ``(model, vectorizer)``: the fitted ``LogisticRegression`` and the
        fitted ``TfidfVectorizer`` that must be used to transform text before
        calling the model.
    """
    # Same 80/20 split and random_state as before, so the training rows do
    # not change.  The held-out part is not used here, hence the underscore
    # names and no transform of the test texts.
    X_train, _X_test, y_train, _y_test = train_test_split(
        df['comment'], df['toxic'], test_size=0.2, random_state=42
    )

    # Vectorize the training text
    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    X_train_vec = vectorizer.fit_transform(X_train)

    # Train the classifier
    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(X_train_vec, y_train)

    return model, vectorizer
130
+
131
# Create the synthetic dataset and train the model.
# These run as top-level statements; predict_toxicity() reads the resulting
# `model` and `vectorizer` as module globals.
df = create_synthetic_dataset()
model, vectorizer = create_and_train_model(df)
134
+
135
+ # Function to predict toxicity
136
def predict_toxicity(comment, threshold=0.7):
    """Score one comment with the module-level ``model`` and ``vectorizer``.

    Args:
        comment: The text to classify.  ``None``, empty, or whitespace-only
            input is treated as "no text" and is never sent to the model.
        threshold: A comment is flagged as toxic when its toxic-class
            probability is strictly greater than this value.

    Returns:
        A dict with the keys ``toxic`` (bool), ``toxicity_score`` (float
        probability of the toxic class) and ``display_text`` (the original
        comment, or ``"No text provided"`` for blank input).
    """
    if comment is None or not comment.strip():
        return {"toxic": False, "toxicity_score": 0.0, "display_text": "No text provided"}

    # Vectorize the comment
    comment_vec = vectorizer.transform([comment])

    # Column 1 of predict_proba is the probability of being toxic
    toxic_prob = float(model.predict_proba(comment_vec)[0][1])

    return {
        # Comparing Python floats yields a plain bool rather than a numpy one.
        "toxic": toxic_prob > threshold,
        "toxicity_score": toxic_prob,
        "display_text": comment,
    }
155
+
156
+ # Function to simulate browser extension highlighting
157
def highlight_toxic_comments(text):
    """Render a block of comments as HTML cards, simulating extension highlighting.

    Args:
        text: Comments separated by newlines (one comment per line).  Blank
            lines are skipped.  ``None`` or blank input yields a placeholder.

    Returns:
        An HTML string: one red card per comment flagged toxic by
        ``predict_toxicity`` and one green card per civil comment, wrapped in
        a single container ``<div>``.
    """
    import html  # function-scope import so the block carries its own dependency

    if text is None or not text.strip():
        return "<div style='font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; color: #666;'>No comments to analyze</div>"

    cards = []
    # Split into comments (assuming each line is a comment)
    for comment in text.split('\n'):
        if not comment.strip():
            continue
        result = predict_toxicity(comment)
        # The comment is user input: escape it so markup it contains is shown
        # as text instead of being interpreted by the browser.
        cards.append(
            _render_comment_card(html.escape(comment), result['toxicity_score'], result['toxic'])
        )

    return (
        "<div style='font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto;'>"
        + "".join(cards)
        + "</div>"
    )


def _render_comment_card(safe_comment, score, is_toxic):
    """Return the HTML card for one already-escaped comment and its score."""
    if is_toxic:
        # Highlight toxic comments in red
        background, accent, label_color = "#ffebee", "#f44336", "#d32f2f"
        label, shadow = "⚠️ Toxic Comment", "0 2px 4px rgba(0,0,0,0.1)"
    else:
        # Keep non-toxic comments neutral with a green accent
        background, accent, label_color = "#f5f5f5", "#4caf50", "#388e3c"
        label, shadow = "✓ Civil Comment", "0 1px 3px rgba(0,0,0,0.1)"

    return f"""
    <div style='
        background-color: {background};
        border-left: 5px solid {accent};
        padding: 12px;
        margin: 10px 0;
        border-radius: 4px;
        box-shadow: {shadow};
    '>
        <div style='display: flex; justify-content: space-between; align-items: center;'>
            <span style='color: {label_color}; font-weight: bold;'>{label}</span>
            <span style='color: #888; font-size: 0.9em;'>Toxicity: {score*100:.1f}%</span>
        </div>
        <p style='margin: 8px 0; color: #333;'>{safe_comment}</p>
    </div>
    """
210
+
211
+ # Function to analyze single comment
212
def analyze_single_comment(comment):
    """Classify one comment and describe the result for display.

    Args:
        comment: The text to analyze.  ``None`` or blank input returns a
            prompt instead of calling the classifier.

    Returns:
        A 3-tuple ``(message, color, score_text)``: a human-readable verdict,
        a color name (``"red"``, ``"green"`` or ``"white"`` for blank input),
        and the toxicity probability formatted as a percentage string.
    """
    if comment is None or not comment.strip():
        return "Please enter a comment to analyze", "white", "0%"

    result = predict_toxicity(comment)
    # Format the percentage once; it appears in both the message and the score.
    score_text = f"{result['toxicity_score']*100:.1f}%"

    if result['toxic']:
        return (
            f"⚠️ This comment is classified as TOXIC with a {score_text} probability.",
            "red",
            score_text,
        )
    return (
        f"✓ This comment is CIVIL with a {score_text} toxicity probability.",
        "green",
        score_text,
    )
230
+
231
+ # Create custom CSS for styling
232
# Create custom CSS for styling.
# This stylesheet is passed to gr.Blocks(css=custom_css) below.
# The classes "gr-box", "example-container" and "example-comment" are attached
# to components through elem_classes in the UI definition.
# NOTE(review): .toxicity-meter and .toxicity-value are not attached to any
# component in the visible source -- confirm whether they are still needed.
# NOTE(review): .gr-button / .gr-tab presumably target Gradio's own class
# names; verify they match the installed Gradio version.
custom_css = """
.gr-button {
    background: linear-gradient(45deg, #ff6b6b, #ff8e8e) !important;
    color: white !important;
    border: none !important;
    border-radius: 8px !important;
    padding: 12px 24px !important;
    font-weight: bold !important;
    transition: all 0.3s ease !important;
}

.gr-button:hover {
    transform: translateY(-2px);
    box-shadow: 0 4px 8px rgba(0,0,0,0.2) !important;
}

.gr-button:active {
    transform: translateY(0);
}

.toxicity-meter {
    background: linear-gradient(90deg, #4caf50 0%, #ffeb3b 50%, #f44336 100%);
    height: 20px;
    border-radius: 10px;
    margin: 10px 0;
    position: relative;
}

.toxicity-value {
    position: absolute;
    top: -25px;
    font-weight: bold;
    color: #333;
}

h1 {
    background: linear-gradient(45deg, #ff6b6b, #ff8e8e);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    text-align: center;
    margin-bottom: 20px !important;
}

.gr-box {
    border-radius: 12px !important;
    border: 2px solid #e0e0e0 !important;
    padding: 16px !important;
}

.gr-tab {
    border-radius: 12px 12px 0 0 !important;
}

.example-container {
    background: #f9f9f9;
    padding: 15px;
    border-radius: 12px;
    margin: 10px 0;
}

.example-comment {
    padding: 10px;
    margin: 5px 0;
    border-radius: 8px;
    background: white;
    cursor: pointer;
    transition: all 0.2s ease;
}

.example-comment:hover {
    transform: translateX(5px);
    box-shadow: 0 2px 5px rgba(0,0,0,0.1);
}
"""
306
+
307
+ # Create Gradio interface
308
# Gradio UI: three tabs (single-comment analysis, multi-comment "browser
# extension" simulator, and an about page) plus the event wiring that connects
# the widgets to analyze_single_comment / highlight_toxic_comments.
with gr.Blocks(title="Toxic Comment Classifier", theme=gr.themes.Soft(), css=custom_css) as demo:
    gr.Markdown(
        """
        # 🚨 Toxic Comment Classifier

        This tool identifies abusive, hateful, or toxic comments using machine learning.
        It simulates how a browser extension would highlight toxic content in red.
        """
    )

    with gr.Tab("🔍 Single Comment Analysis"):
        gr.Markdown("## Analyze a Single Comment")
        with gr.Row():
            with gr.Column(scale=1):
                input_text = gr.Textbox(
                    label="Enter a comment to analyze",
                    placeholder="Type your comment here...",
                    lines=3,
                    elem_classes="gr-box",
                )
                analyze_btn = gr.Button("Analyze Comment", variant="primary")

                # Toxicity meter
                gr.Markdown("### Toxicity Meter")
                toxicity_display = gr.Label(label="Toxicity Score", value="0%")

                # Visual indicator
                gr.Markdown("### Visual Indicator")
                color_box = gr.Textbox(
                    value="Enter a comment to see analysis",
                    interactive=False,
                    label="Analysis Result",
                )

            with gr.Column(scale=1):
                # Examples for single comment
                gr.Markdown("### Try These Examples")
                with gr.Column(elem_classes="example-container"):
                    examples = [
                        "You're such an idiot who knows nothing about politics.",
                        "I appreciate your perspective on this topic.",
                        "People like you are the reason why we have so many problems in society.",
                        "That's an interesting point about the economy.",
                    ]

                    for example in examples:
                        example_btn = gr.Button(
                            example,
                            size="sm",
                            variant="secondary",
                            elem_classes="example-comment",
                        )
                        # `e=example` binds the current text as a default so each
                        # button fills in its own example, not the last one.
                        example_btn.click(
                            fn=lambda e=example: e,
                            inputs=None,
                            outputs=input_text,
                        )

    with gr.Tab("🌐 Browser Extension Simulator"):
        gr.Markdown(
            """
            ## Browser Extension Simulator

            Paste multiple comments (one per line) to simulate how a browser extension would highlight toxic content:
            """
        )

        with gr.Row():
            with gr.Column():
                multi_comments = gr.Textbox(
                    label="Comments (one per line)",
                    placeholder="Enter multiple comments here, one per line...",
                    lines=10,
                    elem_classes="gr-box",
                )
                analyze_multi_btn = gr.Button("Analyze Comments", variant="primary")

            with gr.Column():
                highlighted_output = gr.HTML(label="Highlighted Comments")

        # Examples for multiple comments
        gr.Markdown("### Example Comment Threads")
        with gr.Row():
            with gr.Column():
                example1 = gr.Examples(
                    examples=[
                        "\n".join([
                            "You're such an idiot who knows nothing about politics.",
                            "I appreciate your perspective on this topic.",
                            "People like you are the reason why we have so many problems in society.",
                            "That's an interesting point about the economy.",
                            "Everyone like you should be banned from this platform.",
                        ])
                    ],
                    inputs=multi_comments,
                    label="Example 1",
                )
            with gr.Column():
                example2 = gr.Examples(
                    examples=[
                        "\n".join([
                            "This is the dumbest take on sports I've ever seen.",
                            "Thanks for sharing your thoughts on the environment.",
                            "I hope you disappear for saying that.",
                            "I see what you mean, but have you considered the historical context?",
                        ])
                    ],
                    inputs=multi_comments,
                    label="Example 2",
                )

    with gr.Tab("📘 About This Project"):
        gr.Markdown(
            """
            ## About the Toxic Comment Classifier

            This project demonstrates a machine learning approach to identifying toxic comments online.

            **How it works:**
            - Uses TF-IDF for text vectorization
            - Employs Logistic Regression for classification
            - Trained on a synthetic dataset of toxic and non-toxic comments

            **Browser Extension Simulation:**
            The tool simulates how a browser extension would highlight toxic comments in red
            and civil comments in green, creating a visual content moderation aid.

            **Potential Applications:**
            - Social media moderation
            - Forum content filtering
            - Online community management

            **Note:** This is a demonstration using synthetic data. Real-world applications would require
            training on larger, more diverse datasets for improved accuracy.
            """
        )

    # Setup event handlers

    def _analysis_for_display(comment):
        """Map analyze_single_comment's 3-tuple onto the two output widgets."""
        # analyze_single_comment returns (message, color, score).  No widget
        # shows the color name, and listing color_box twice as an output left
        # the message and the color competing for the same textbox.
        message, _color, score = analyze_single_comment(comment or "")
        return message, score

    def _live_score(comment):
        """Return the toxicity percentage shown while the user is typing."""
        if not comment or not comment.strip():
            return "0%"
        return f"{predict_toxicity(comment)['toxicity_score']*100:.1f}%"

    analyze_btn.click(
        fn=_analysis_for_display,
        inputs=input_text,
        outputs=[color_box, toxicity_display],
    )

    analyze_multi_btn.click(
        fn=highlight_toxic_comments,
        inputs=multi_comments,
        outputs=highlighted_output,
    )

    # Update toxicity display when text changes
    input_text.change(
        fn=_live_score,
        inputs=input_text,
        outputs=toxicity_display,
    )
456
+
457
# Launch the application.
# The guard means demo.launch() runs only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    demo.launch()