Spaces:
Running
Running
jl committed on
Commit ·
34b86dc
0
Parent(s):
prototype initial
Browse files- .gitignore +10 -0
- .python-version +1 -0
- README.md +0 -0
- app.py +27 -0
- hatespeech_model.py +52 -0
- pyproject.toml +12 -0
- uv.lock +0 -0
.gitignore
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python-generated files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[oc]
|
| 4 |
+
build/
|
| 5 |
+
dist/
|
| 6 |
+
wheels/
|
| 7 |
+
*.egg-info
|
| 8 |
+
|
| 9 |
+
# Virtual environments
|
| 10 |
+
.venv
|
.python-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
3.14
|
README.md
ADDED
|
File without changes
|
app.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
from hatespeech_model import predict_hatespeech
import random

st.set_page_config(page_title="Hatespeech Classifier", layout="centered")
st.title("Hatespeech Text Classifier")
st.write("Enter text below to classify if it is hatespeech or not.")

user_input = st.text_area("Text to classify", "")

# Placeholder per-word "importances" until a real interpretable model is
# wired in. split() without an argument drops empty tokens, so repeated
# spaces or newlines in the input never produce empty-string keys.
words = user_input.split()
word_probabilities = {word: round(random.uniform(0, 1), 2) for word in words}

if st.button("Classify"):
    if user_input.strip():
        result = predict_hatespeech(user_input)
        st.markdown(f"**Result:** {result}")
        col1, col2 = st.columns(2)
        col1.title("Shield Model Results")
        col2.title("Interpretable Shield Model Results")
        col1.write(f"**Result:** {result} ")
        col1.write(f"**Probability:** {random.uniform(0, 1)} ")
        col2.write(f"**Result:** {result}")
        col2.write(f"**Probability:** {random.uniform(0, 1)} ")
        # BUG FIX: st.table requires equal-length columns. The old code
        # paired the raw split list (which kept empty strings and
        # duplicates) with the filtered/deduplicated dict's values, so the
        # lengths could differ and the call raised. Build both columns
        # from the same dict so they always align.
        col2.table({
            "Feature": list(word_probabilities.keys()),
            "Importance": list(word_probabilities.values()),
        })
    else:
        st.warning("Please enter some text to classify.")
|
hatespeech_model.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import nltk
|
| 3 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
| 4 |
+
from sklearn.naive_bayes import MultinomialNB
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import os
|
| 7 |
+
|
| 8 |
+
# Download NLTK data if not already present
|
| 9 |
+
nltk.download('stopwords', quiet=True)
|
| 10 |
+
from nltk.corpus import stopwords
|
| 11 |
+
|
| 12 |
+
# Example training data (for demonstration)
# Kept as (text, label) pairs so each example reads on one line.
_samples = [
    ('I hate you', 1),
    ('You are so stupid', 1),
    ('Have a nice day', 0),
    ('I love this', 0),
    ('You are an idiot', 1),
    ('What a wonderful world', 0),
    ('You are disgusting', 1),
    ('Such a pleasant surprise', 0),
    ('I despise your actions', 1),
    ('You are amazing', 0),
]
data = {
    'text': [text for text, _ in _samples],
    'label': [label for _, label in _samples],  # 1 = hatespeech, 0 = not
}
df = pd.DataFrame(data)
|
| 29 |
+
|
| 30 |
+
# Preprocessing function
def preprocess(text):
    """Normalize *text* for the bag-of-words model.

    Lowercases, strips every character that is not a letter or whitespace,
    and removes English stopwords. Returns the surviving tokens joined by
    single spaces.
    """
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # PERF FIX: the original evaluated stopwords.words('english') once per
    # token — rebuilding the whole list every iteration, then doing an O(n)
    # list membership test. Build the set once per call instead.
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in text.split() if t not in stop_words]
    return ' '.join(tokens)
|
| 37 |
+
|
| 38 |
+
# Clean every training example with the same pipeline used at predict time.
df['text_clean'] = df['text'].apply(preprocess)

# Vectorizer and model
# Bag-of-words counts over the cleaned text; MultinomialNB is a standard
# choice for count features. Trained once at import time on the toy data.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['text_clean'])
y = df['label']

model = MultinomialNB()
model.fit(X, y)
|
| 47 |
+
|
| 48 |
+
def predict_hatespeech(text):
    """Classify *text* with the module-level model.

    Returns 'Hatespeech' for a positive (1) prediction, otherwise
    'Not Hatespeech'.
    """
    cleaned = preprocess(text)
    features = vectorizer.transform([cleaned])
    label = model.predict(features)[0]
    if label == 1:
        return 'Hatespeech'
    return 'Not Hatespeech'
|
pyproject.toml
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "proto"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "Add your description here"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.14"
|
| 7 |
+
dependencies = [
|
| 8 |
+
"nltk>=3.9.2",
|
| 9 |
+
"pandas>=2.3.3",
|
| 10 |
+
"scikit-learn>=1.8.0",
|
| 11 |
+
"streamlit>=1.52.1",
|
| 12 |
+
]
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|