Spaces:
No application file
No application file
| title: Comment Classification | |
| emoji: 🏢 | |
| colorFrom: yellow | |
| colorTo: indigo | |
| sdk: gradio | |
| sdk_version: 5.46.1 | |
| app_file: app.py | |
| pinned: false | |
| short_description: comment classification | |
| Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference | |
| import gradio as gr | |
| import pandas as pd | |
| import numpy as np | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.model_selection import train_test_split | |
| import random | |
| import re | |
def create_synthetic_dataset():
    """Build a reproducible, shuffled corpus of templated comments.

    The stdlib and NumPy global RNGs are both seeded with 42. Then 500 toxic
    (label 1) and 500 non-toxic (label 0) comments are generated by filling
    sentence templates with randomly drawn words, and the combined rows are
    shuffled.

    Returns:
        pandas.DataFrame: 1000 rows with columns ``comment`` and ``toxic``.
    """
    random.seed(42)
    np.random.seed(42)

    # Sentence templates for the toxic class.
    hostile_templates = (
        "You're such a {insult} who knows nothing about {topic}.",
        "Only an {insult} would think that about {topic}.",
        "This is the dumbest take on {topic} I've ever seen.",
        "Go back to {place}, you {insult}.",
        "Why are you so {negative_adj} about everything?",
        "Everyone like you should be {threat}.",
        "Your opinion is worthless because you're a {insult}.",
        "I hope you {threat} for saying that.",
        "People like you are the reason why {bad_thing} happens.",
        "Shut up, you don't know what you're talking about.",
        "You're just a {insult} with no life.",
        "How can anyone be this {negative_adj}?",
        "I wouldn't expect anything better from a {insult}.",
        "Your existence is an insult to {group}.",
        "Do everyone a favor and {threat}.",
    )

    # Sentence templates for the non-toxic class.
    civil_templates = (
        "I appreciate your perspective on {topic}.",
        "That's an interesting point about {topic}.",
        "I see what you mean, but have you considered {alternative_view}?",
        "Thanks for sharing your thoughts on {topic}.",
        "I respectfully disagree because of {reason}.",
        "That's a good question about {topic}.",
        "I learned something new about {topic} today.",
        "Could you elaborate more on your view about {topic}?",
        "I never thought about it that way before.",
        "You make a valid point regarding {topic}.",
        "I understand where you're coming from.",
        "Let's agree to disagree on this one.",
        "I value different opinions on {topic}.",
        "That's a fair assessment of the situation.",
        "I think we have common ground on {shared_view}.",
    )

    subjects = ("politics", "sports", "technology", "music", "movies", "science", "education", "health", "environment", "economy")

    # Placeholder name -> word pool. Insertion order is significant: it fixes
    # the order in which the RNG is consumed for every generated comment.
    hostile_slots = {
        "insult": ("idiot", "moron", "fool", "jerk", "imbecile", "buffoon", "dimwit", "simpleton", "dunce", "nitwit"),
        "topic": subjects,
        "negative_adj": ("stupid", "ignorant", "pathetic", "ridiculous", "awful", "terrible", "horrible", "disgusting", "vile", "repulsive"),
        "place": ("your country", "where you came from", "your mom's basement", "the cave you live in", "under your rock"),
        "threat": ("die", "disappear", "stop talking", "leave", "get banned", "be quiet", "go away", "never return", "get lost", "vanish"),
        "bad_thing": ("war", "famine", "disease", "poverty", "conflict", "hate", "violence", "discrimination", "suffering", "chaos"),
        "group": ("humanity", "society", "this community", "intelligent people", "decent folks"),
    }
    civil_slots = {
        "topic": subjects,
        "alternative_view": ("this other aspect", "the historical context", "the data", "recent developments", "expert opinions"),
        "reason": ("my experiences", "the evidence", "what I've read", "statistics", "expert analysis"),
        "shared_view": ("this issue", "the importance of dialogue", "seeking truth", "finding solutions", "moving forward"),
    }

    def _render(template, slots):
        # Draw one word for every slot, whether or not the template uses it,
        # so each comment consumes the same number of random draws.
        picks = {slot: random.choice(pool) for slot, pool in slots.items()}
        return template.format(**picks)

    rows = [(_render(random.choice(hostile_templates), hostile_slots), 1) for _ in range(500)]
    rows += [(_render(random.choice(civil_templates), civil_slots), 0) for _ in range(500)]
    random.shuffle(rows)

    return pd.DataFrame(rows, columns=['comment', 'toxic'])
def create_and_train_model(df):
    """Fit a TF-IDF + logistic-regression classifier on a labelled comment frame.

    Args:
        df: DataFrame with a ``comment`` column (raw text) and a ``toxic``
            column (labels used as the training target).

    Returns:
        tuple: ``(model, vectorizer)`` - the fitted ``LogisticRegression`` and
        the fitted ``TfidfVectorizer`` needed to transform text before
        calling the model.
    """
    # Keep the seeded 80/20 split so the model is trained on exactly the same
    # rows as before. The held-out 20% is not evaluated here, so it is no
    # longer vectorized.
    X_train, _X_test, y_train, _y_test = train_test_split(
        df['comment'], df['toxic'], test_size=0.2, random_state=42
    )

    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    X_train_vec = vectorizer.fit_transform(X_train)

    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(X_train_vec, y_train)
    return model, vectorizer
# Build the synthetic corpus and fit the classifier once, when the module is
# loaded; `model` and `vectorizer` are then read as module-level globals.
df = create_synthetic_dataset()
model, vectorizer = create_and_train_model(df=df)
def predict_toxicity(comment, threshold=0.7):
    """Score one comment with the module-level ``model`` and ``vectorizer``.

    Args:
        comment: Text to classify. ``None``, empty, or whitespace-only input
            short-circuits to a non-toxic "No text provided" result without
            touching the model.
        threshold: The comment is flagged toxic when the toxic-class
            probability is strictly greater than this value. Defaults to 0.7.

    Returns:
        dict: ``{"toxic": bool, "toxicity_score": float, "display_text": str}``.
        ``display_text`` is the input comment unchanged, or the placeholder
        message for blank input.
    """
    if comment is None or not comment.strip():
        return {"toxic": False, "toxicity_score": 0.0, "display_text": "No text provided"}

    comment_vec = vectorizer.transform([comment])

    # predict_proba yields one row per input; index 1 is read as the
    # probability of the toxic class. Converting to a plain float up front
    # makes the comparison below produce a built-in bool rather than a
    # numpy.bool_.
    toxic_prob = float(model.predict_proba(comment_vec)[0][1])

    return {
        "toxic": toxic_prob > threshold,
        "toxicity_score": toxic_prob,
        "display_text": comment,
    }
def _render_comment_card(comment, score, is_toxic):
    """Return the HTML card for a single classified comment.

    The comment text is HTML-escaped before being embedded, so markup typed
    by the user is displayed literally instead of being interpreted.
    """
    import html  # function-scope import keeps this block self-contained

    if is_toxic:
        background, accent, shadow = "#ffebee", "#f44336", "0 2px 4px rgba(0,0,0,0.1)"
        label_color, label = "#d32f2f", "⚠️ Toxic Comment"
    else:
        background, accent, shadow = "#f5f5f5", "#4caf50", "0 1px 3px rgba(0,0,0,0.1)"
        label_color, label = "#388e3c", "✓ Civil Comment"

    return (
        "\n<div style='\n"
        f"background-color: {background};\n"
        f"border-left: 5px solid {accent};\n"
        "padding: 12px;\n"
        "margin: 10px 0;\n"
        "border-radius: 4px;\n"
        f"box-shadow: {shadow};\n"
        "'>\n"
        "<div style='display: flex; justify-content: space-between; align-items: center;'>\n"
        f"<span style='color: {label_color}; font-weight: bold;'>{label}</span>\n"
        f"<span style='color: #888; font-size: 0.9em;'>Toxicity: {score * 100:.1f}%</span>\n"
        "</div>\n"
        f"<p style='margin: 8px 0; color: #333;'>{html.escape(comment)}</p>\n"
        "</div>\n"
    )


def highlight_toxic_comments(text):
    """Render a block of comments as HTML cards, one card per non-blank line.

    Each line is classified with ``predict_toxicity``; toxic lines get a red
    card and the rest a green one, each showing its toxicity percentage.

    Args:
        text: Comments separated by newlines. ``None``, empty, or
            whitespace-only input returns a "No comments to analyze" notice.

    Returns:
        str: An HTML fragment wrapping all cards in a single container div.
    """
    if text is None or not text.strip():
        return "<div style='font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; color: #666;'>No comments to analyze</div>"

    cards = []
    for comment in text.split('\n'):
        if not comment.strip():
            continue
        result = predict_toxicity(comment)
        cards.append(_render_comment_card(comment, result['toxicity_score'], result['toxic']))

    return (
        "<div style='font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto;'>"
        + "".join(cards)
        + "</div>"
    )
def analyze_single_comment(comment):
    """Classify one comment and describe the outcome for the UI.

    Args:
        comment: The text to analyze.

    Returns:
        tuple: ``(message, color, percentage)`` where ``color`` is ``"white"``
        for blank input, ``"red"`` for a toxic verdict and ``"green"``
        otherwise, and ``percentage`` is the toxicity score formatted with
        one decimal place (``"0%"`` for blank input).
    """
    if comment.strip() == "":
        return "Please enter a comment to analyze", "white", "0%"

    verdict = predict_toxicity(comment)
    percent = f"{verdict['toxicity_score']*100:.1f}%"

    if verdict['toxic']:
        message = f"⚠️ This comment is classified as TOXIC with a {percent} probability."
        color = "red"
    else:
        message = f"✓ This comment is CIVIL with a {percent} toxicity probability."
        color = "green"
    return message, color, percent
# Stylesheet passed to gr.Blocks(css=...) when the interface is built.
# .gr-box, .example-container and .example-comment are attached to components
# through elem_classes in the layout code.
# NOTE(review): .gr-button and .gr-tab are never assigned via elem_classes in
# this file - presumably they target Gradio's own class names; confirm they
# exist in the pinned Gradio version.
# NOTE(review): .toxicity-meter / .toxicity-value are not referenced by any
# component visible in this file.
custom_css = """
.gr-button {
background: linear-gradient(45deg, #ff6b6b, #ff8e8e) !important;
color: white !important;
border: none !important;
border-radius: 8px !important;
padding: 12px 24px !important;
font-weight: bold !important;
transition: all 0.3s ease !important;
}
.gr-button:hover {
transform: translateY(-2px);
box-shadow: 0 4px 8px rgba(0,0,0,0.2) !important;
}
.gr-button:active {
transform: translateY(0);
}
.toxicity-meter {
background: linear-gradient(90deg, #4caf50 0%, #ffeb3b 50%, #f44336 100%);
height: 20px;
border-radius: 10px;
margin: 10px 0;
position: relative;
}
.toxicity-value {
position: absolute;
top: -25px;
font-weight: bold;
color: #333;
}
h1 {
background: linear-gradient(45deg, #ff6b6b, #ff8e8e);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
text-align: center;
margin-bottom: 20px !important;
}
.gr-box {
border-radius: 12px !important;
border: 2px solid #e0e0e0 !important;
padding: 16px !important;
}
.gr-tab {
border-radius: 12px 12px 0 0 !important;
}
.example-container {
background: #f9f9f9;
padding: 15px;
border-radius: 12px;
margin: 10px 0;
}
.example-comment {
padding: 10px;
margin: 5px 0;
border-radius: 8px;
background: white;
cursor: pointer;
transition: all 0.2s ease;
}
.example-comment:hover {
transform: translateX(5px);
box-shadow: 0 2px 5px rgba(0,0,0,0.1);
}
"""
def _single_comment_outputs(comment):
    """Adapt ``analyze_single_comment`` to the two components it updates.

    ``analyze_single_comment`` returns ``(message, color, percentage)``. The
    UI has one textbox for the message and one label for the percentage, so
    the colour name is dropped here instead of being written into the message
    textbox.
    """
    message, _color, score = analyze_single_comment(comment)
    return message, score


# Build the Gradio interface: three tabs (single-comment analysis, a
# multi-comment "browser extension" simulator, and an about page).
with gr.Blocks(title="Toxic Comment Classifier", theme=gr.themes.Soft(), css=custom_css) as demo:
    gr.Markdown(
        """
        # 🚨 Toxic Comment Classifier
        This tool identifies abusive, hateful, or toxic comments using machine learning.
        It simulates how a browser extension would highlight toxic content in red.
        """
    )

    with gr.Tab("🔍 Single Comment Analysis"):
        gr.Markdown("## Analyze a Single Comment")
        with gr.Row():
            with gr.Column(scale=1):
                input_text = gr.Textbox(
                    label="Enter a comment to analyze",
                    placeholder="Type your comment here...",
                    lines=3,
                    elem_classes="gr-box"
                )
                analyze_btn = gr.Button("Analyze Comment", variant="primary")

                # Toxicity meter
                gr.Markdown("### Toxicity Meter")
                toxicity_display = gr.Label(label="Toxicity Score", value="0%")

                # Visual indicator
                gr.Markdown("### Visual Indicator")
                color_box = gr.Textbox(
                    value="Enter a comment to see analysis",
                    interactive=False,
                    label="Analysis Result"
                )

            with gr.Column(scale=1):
                # Examples for single comment
                gr.Markdown("### Try These Examples")
                with gr.Column(elem_classes="example-container"):
                    examples = [
                        "You're such an idiot who knows nothing about politics.",
                        "I appreciate your perspective on this topic.",
                        "People like you are the reason why we have so many problems in society.",
                        "That's an interesting point about the economy."
                    ]
                    for example in examples:
                        example_btn = gr.Button(
                            example,
                            size="sm",
                            variant="secondary",
                            elem_classes="example-comment"
                        )
                        # Bind the current example as a default argument so
                        # each button fills in its own text, not the last
                        # one from the loop.
                        example_btn.click(
                            fn=lambda e=example: e,
                            inputs=None,
                            outputs=input_text
                        )

    with gr.Tab("🌐 Browser Extension Simulator"):
        gr.Markdown("""
        ## Browser Extension Simulator
        Paste multiple comments (one per line) to simulate how a browser extension would highlight toxic content:
        """)
        with gr.Row():
            with gr.Column():
                multi_comments = gr.Textbox(
                    label="Comments (one per line)",
                    placeholder="Enter multiple comments here, one per line...",
                    lines=10,
                    elem_classes="gr-box"
                )
                analyze_multi_btn = gr.Button("Analyze Comments", variant="primary")
            with gr.Column():
                highlighted_output = gr.HTML(label="Highlighted Comments")

        # Examples for multiple comments
        gr.Markdown("### Example Comment Threads")
        with gr.Row():
            with gr.Column():
                example1 = gr.Examples(
                    examples=[
                        "You're such an idiot who knows nothing about politics.\n"
                        "I appreciate your perspective on this topic.\n"
                        "People like you are the reason why we have so many problems in society.\n"
                        "That's an interesting point about the economy.\n"
                        "Everyone like you should be banned from this platform."
                    ],
                    inputs=multi_comments,
                    label="Example 1"
                )
            with gr.Column():
                example2 = gr.Examples(
                    examples=[
                        "This is the dumbest take on sports I've ever seen.\n"
                        "Thanks for sharing your thoughts on the environment.\n"
                        "I hope you disappear for saying that.\n"
                        "I see what you mean, but have you considered the historical context?"
                    ],
                    inputs=multi_comments,
                    label="Example 2"
                )

    with gr.Tab("📘 About This Project"):
        gr.Markdown("""
        ## About the Toxic Comment Classifier

        This project demonstrates a machine learning approach to identifying toxic comments online.

        **How it works:**

        - Uses TF-IDF for text vectorization
        - Employs Logistic Regression for classification
        - Trained on a synthetic dataset of toxic and non-toxic comments

        **Browser Extension Simulation:**

        The tool simulates how a browser extension would highlight toxic comments in red
        and civil comments in green, creating a visual content moderation aid.

        **Potential Applications:**

        - Social media moderation
        - Forum content filtering
        - Online community management

        **Note:** This is a demonstration using synthetic data. Real-world applications would require
        training on larger, more diverse datasets for improved accuracy.
        """)

    # Setup event handlers
    # Each returned value maps to exactly one output component: the message
    # goes to the result textbox, the percentage to the score label.
    analyze_btn.click(
        fn=_single_comment_outputs,
        inputs=input_text,
        outputs=[color_box, toxicity_display]
    )
    analyze_multi_btn.click(
        fn=highlight_toxic_comments,
        inputs=multi_comments,
        outputs=highlighted_output
    )

    # Update toxicity display when text changes
    input_text.change(
        fn=lambda x: "0%" if not x.strip() else f"{predict_toxicity(x)['toxicity_score']*100:.1f}%",
        inputs=input_text,
        outputs=toxicity_display
    )

# Launch the application
if __name__ == "__main__":
    demo.launch()