Tharwat-Elsayed commited on
Commit
5f0ea85
·
verified ·
1 Parent(s): 975c327

Upload 17 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ One-layer_BiLSTM_without_dropout.keras filter=lfs diff=lfs merge=lfs -text
7C_10R_Federated_nlp.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Abdullah-N-Moustafa.png ADDED
Ayman Elsayed.jpg ADDED
Decision_Tree_Model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a6e2dcc7ecc08011385f99c5807aa1190203b6dba851c3be687ff2d469d64fd
3
+ size 1764
Hate_Speach.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
LR_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7d25d7c47ac375d12ae28b03c1f1d72ac4ca376cf7248f15897483f74b0f967
3
+ size 3151
One-layer_BiLSTM_without_dropout.keras ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:786dbac898bff4c607cd8a54e6d43f7f709be9ef6ecdbb33b9fe2641152da47a
3
+ size 40773364
README.md CHANGED
@@ -1,3 +1,5 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
1
+ Tweet Tone Triage Technique (4T): A Secured
2
+ Federated Deep Learning Approach
3
+
4
+ You can try it at:
5
+ https://tweet-tone-triage-technique-4t.streamlit.app/
Random_Forest_Model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c67a90939eeb7904db3ddde7c7909cacb1cce822f61a1248975628b1e1edf46f
3
+ size 93259
SVM_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c1790caab1c9d946682cc8921c31c3d2bb04afc6411cf0df166c76359ef846d
3
+ size 9357475
Tharwat Elsayed Ismail.JPG ADDED
Tokenized_Padded_tweets.csv ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,547 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import required libraries
2
+ import pandas as pd
3
+ import plotly.express as px
4
+ import plotly.graph_objects as go
5
+ import matplotlib.pyplot as plt
6
+ import io
7
+ import streamlit as st
8
+ from streamlit_option_menu import option_menu
9
+ import re
10
+ from nltk.stem import PorterStemmer
11
+ from nltk.tokenize import word_tokenize
12
+ from tensorflow.keras.preprocessing.text import Tokenizer
13
+ from tensorflow.keras.utils import pad_sequences
14
+ import pickle
15
+ from PIL import Image
16
+
17
+ import numpy as np
18
+ from tensorflow.keras.layers import Layer
19
+ from tensorflow.keras.models import load_model
20
+ from tensorflow.keras import backend as K
21
+
22
+
23
# Define the custom attention layer
class attention(Layer):
    """Additive attention layer used by the saved BiLSTM model.

    Scores each timestep, softmax-normalizes the scores over the time
    axis, and re-weights the input sequence. The lowercase class name is
    deliberate: the saved ``.keras`` file is loaded with
    ``custom_objects={'attention': attention}``, so it must not change.
    """

    def __init__(self, return_sequences=True, **kwargs):
        # return_sequences=True yields the re-weighted sequence;
        # False collapses the time axis into a single context vector.
        self.return_sequences = return_sequences
        super(attention, self).__init__(**kwargs)

    def build(self, input_shape):
        # W: (features, 1) scoring weights; b: (timesteps, 1) per-step bias.
        # NOTE(review): the bias shape ties this layer to a fixed sequence
        # length (input_shape[1]) — inputs are presumably always padded to
        # the same max length (100 elsewhere in this file); confirm.
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1),
                                 initializer="normal")
        self.b = self.add_weight(name="att_bias", shape=(input_shape[1], 1),
                                 initializer="zeros")
        super(attention, self).build(input_shape)

    def call(self, x):
        # e: unnormalized per-timestep scores; a: attention weights
        # (softmax over the time axis, axis=1).
        e = K.tanh(K.dot(x, self.W) + self.b)
        a = K.softmax(e, axis=1)
        output = x * a

        if self.return_sequences:
            return output

        # Collapse the time axis into one attended vector.
        return K.sum(output, axis=1)

    def get_config(self):
        # Persist the constructor argument so the layer round-trips
        # through model save/load.
        config = super(attention, self).get_config()
        config.update({'return_sequences': self.return_sequences})
        return config
50
+
51
+
52
# Preprocessing functions.
# Raw strings: escapes such as \s, \w and \( inside plain string literals
# are deprecated escape sequences (SyntaxWarning on modern Python); the
# patterns themselves are unchanged.
space_pattern = r'\s+'
giant_url_regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                   r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
mention_regex = r'@[\w\-]+'
emoji_regex = '&#[0-9]{4,6};'

def preprocess(text_string):
    """Normalize a raw tweet.

    Collapses runs of whitespace to single spaces, then strips URLs,
    @-mentions, the retweet marker 'RT', HTML-escaped emoji codes
    (``&#NNNN;``), and the ellipsis character.

    Returns the cleaned string. Removed tokens leave their surrounding
    spaces behind because whitespace is collapsed *before* removal.
    """
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    # NOTE(review): this removes 'RT' anywhere, including inside words
    # (e.g. 'shiRT' -> 'shi'); use r'\bRT\b' if that ever matters.
    parsed_text = re.sub('RT', '', parsed_text)
    parsed_text = re.sub(emoji_regex, '', parsed_text)
    parsed_text = re.sub('…', '', parsed_text)
    return parsed_text
67
+
68
def preprocess_clean(text_string, remove_hashtags=True, remove_special_chars=True):
    """Lowercase and further clean an already-normalized tweet.

    Runs :func:`preprocess` first, lowercases the result, and removes
    apostrophes, colons, commas and the '&amp' HTML-escape prefix.
    Optionally drops whole #hashtags and runs of '!' / '?'.

    Parameters
    ----------
    text_string : str            raw or pre-normalized tweet text
    remove_hashtags : bool       drop entire '#tag' tokens when True
    remove_special_chars : bool  drop runs of '!' and '?' when True
    """
    text_string = preprocess(text_string)
    parsed_text = text_string.lower()
    parsed_text = re.sub('\'', '', parsed_text)
    parsed_text = re.sub(':', '', parsed_text)
    parsed_text = re.sub(',', '', parsed_text)
    parsed_text = re.sub('&amp', '', parsed_text)

    if remove_hashtags:
        # Raw strings: \w / \! in plain literals are deprecated escapes.
        parsed_text = re.sub(r'#[\w\-]+', '', parsed_text)
    if remove_special_chars:
        parsed_text = re.sub(r'(\!|\?)+', '', parsed_text)
    return parsed_text
81
+
82
def strip_hashtags(text):
    """Clean *text* (keeping hashtags), then turn each '#tag' into 'tag'.

    Uses plain string replacement rather than ``re.sub(tag, ...)`` so the
    matched hashtag text is never re-interpreted as a regex pattern
    (identical output for '#[\\w\\-]+' matches, but robust by construction).
    """
    # remove_hashtags=False so the tags survive cleaning and can be split here.
    text = preprocess_clean(text, False, True)
    hashtags = re.findall(r'#[\w\-]+', text)
    for tag in hashtags:
        cleantag = tag[1:]  # drop the leading '#'
        text = text.replace(tag, cleantag)
    return text
89
+
90
# Stemming function
# Single shared Porter stemmer instance (stateless, safe to reuse per call).
stemmer = PorterStemmer()
def stemming(text):
    """Split *text* on whitespace and Porter-stem each token.

    Returns a list of stemmed tokens (not a joined string) — downstream
    code feeds this list to the Keras Tokenizer.
    """
    stemmed_tweets = [stemmer.stem(t) for t in text.split()]
    return stemmed_tweets
95
+
96
# Set the page layout to wide mode
st.set_page_config(layout="wide")

# Load the dataset (Davidson et al. labeled tweets; read once per rerun)
df = pd.read_csv('labeled_data.csv')

# Set Streamlit page title
#st.title('Hate Speech and Offensive Language Analysis')

# Create a vertical tab menu in the sidebar; `selected` drives the
# top-level if/elif page dispatch below.
with st.sidebar:
    selected = option_menu(
        menu_title="Tweet Tone Triage Technique (4T): A Secured Federated Deep Learning Approach",  # Title of the menu
        options=["Data Acquisition", "Data Exploration", "Data Classes Balancing", "Data Preparation", "ML Model Selection", "Try The Model", "About", "Contact"],  # Menu options
        icons=["house","cloud", "list", "gear", "graph-up", "briefcase","info","envelope"],  # Optional icons
        menu_icon="cast",  # Icon for the menu title
        default_index=5,  # Default selected option ("Try The Model")
        orientation="vertical"  # Set the orientation to vertical
    )
115
+
116
# Display content based on selected tab
if selected == "Data Acquisition":
    # Static description of the dataset and its source links.
    st.title("Hate Speech and Offensive Language Dataset")
    st.write("""This dataset contains data related to hate speech and offensive language.
Davidson introduced a dataset of tweets categorized using a crowdsourced hate speech vocabulary.
These tweets were classified into three categories: hate speech, offensive language, and neither.
The dataset, consisting of 24,802 labeled tweets, includes columns for the number of CrowdFlower coders,
the count of hate speech and offensive language identifications, and a class label indicating
the majority opinion: 0 for hate speech, 1 for offensive language, and 2 for neither.\n
The dataset published in:\n
Davidson, T., Warmsley, D., Macy, M., & Weber, I. (2017, May). Automated hate speech
detection and the problem of offensive language. In Proceedings of the international
AAAI conference on web and social media (Vol. 11, No. 1, pp. 512-515).

The Dataset can be downloaded from:
https://www.kaggle.com/datasets/mrmorj/hate-speech-and-offensive-language-dataset
https://github.com/t-davidson/hate-speech-and-offensive-language
""")
    # Horizontal line separator
    st.markdown("---")
136
+
137
elif selected == "Data Exploration":
    # Four read-only views of the loaded DataFrame `df`.
    st.title("Loading and Previewing the Dataset")
    # Create tabs
    tab1, tab2, tab3, tab4 = st.tabs(["Dataset Information", "Dataset Description", "Dataset Overview", "Missing values"])

    # Tab 1: Dataset Brief Information
    with tab1:
        st.subheader('Dataset Information')

        # Capture the df.info() output (df.info prints, so redirect into a buffer)
        buffer = io.StringIO()
        df.info(buf=buffer)
        s = buffer.getvalue()

        # Display the info in Streamlit
        st.text(s)

    # Tab 2: Dataset Columns Description
    with tab2:
        st.subheader('Dataset Columns Description')
        st.write(df.describe(include='all'))

    # Tab 3: Dataset Overview (Before Preprocessing)
    with tab3:
        st.subheader('Dataset Overview (Before Preprocessing)')
        st.write(df.head(10))

    # Tab 4: Check for missing data
    with tab4:
        # Per-column null counts
        st.subheader("Missing values in each column:")
        st.write(df.isnull().sum())

    # Horizontal line separator
    st.markdown("---")
172
+
173
elif selected == "Data Classes Balancing":
    st.title("Understanding Class Distribution")
    # Class column only (0: Hate Speech, 1: Offensive Language, 2: Neither)
    df_fig = df['class']
    # Class labels, index-aligned with the 0/1/2 class codes
    class_labels = ['Hate Speech', 'Offensive Language', 'Neither']

    # Create tabs
    tab1, tab2 = st.tabs(["Bar Chart", "Pie Chart"])

    # Tab 1: Distribution of Classes (Bar Chart)
    with tab1:
        st.subheader('Distribution of Classes (Bar Chart)')

        # Count occurrences of each class; reindex guarantees all three
        # classes appear (0 when absent) and fixes the order to 0,1,2.
        # NOTE: class_counts is also reused by the pie chart in tab2 below.
        class_counts = df_fig.value_counts().reindex([0, 1, 2], fill_value=0)

        # Create a bar chart using Plotly
        bar_fig = px.bar(
            x=class_labels,
            y=class_counts.values,
            labels={'x': 'Class', 'y': 'Frequency'},
            title='Distribution of Classes',
            color=class_labels,
        )

        # Show the bar chart
        st.plotly_chart(bar_fig)

    # Tab 2: Proportion of Classes (Pie Chart)
    with tab2:
        st.subheader('Proportion of Classes (Pie Chart)')

        # Create a pie chart using Plotly
        pie_fig = go.Figure(
            data=[go.Pie(
                labels=class_labels,
                values=class_counts.values,
                hole=0.3,  # Make it a donut chart for style
                pull=[0, 0.1, 0],  # Pull out the second slice slightly
                marker=dict(colors=['#FF6347', '#FFD700', '#90EE90']),
                textinfo='label+percent',
                hoverinfo='label+value'
            )]
        )

        pie_fig.update_layout(
            title_text="Distribution of Classes (Pie Chart)",
            showlegend=True
        )

        # Show the pie chart
        st.plotly_chart(pie_fig)
    # Horizontal line separator
    st.markdown("---")
229
+
230
elif selected == "Data Preparation":
    # Describes the preprocessing pipeline and shows each intermediate
    # stage from pre-computed CSV snapshots.
    st.title("Dataset Preprocessing")

    st.write("""
We used slight pre-processing to normalize the tweets content by:
A) Delete the characters outlined here (— : , ; ! ?).
B) Normalize hashtags into words, thus ’refugeesnotwelcome’ becomes ’refugees not welcome’.
This is due to the fact that such hashtags are frequently employed when creating phrases.
C) We separate such hashtags using a dictionary-based lookup.
D) To eliminate word inflections, use lowercase to remove capital letters and stemming to overcome the problem of several forms of words.
E) Encode the tweets into integers and pad each tweet to the max length of 100 words.

""")
    # Horizontal line separator
    st.markdown("---")

    # Create tabs
    tab1, tab2, tab3, tab4 = st.tabs(["Tweets Before Preprocessing", "Cleaned Tweets", "Stemmed Tweets", "Tokenized Tweets"])

    # Tab 1: Tweets Before Preprocessing
    with tab1:
        st.subheader('Tweets Before Preprocessing')
        st.write(df.tweet)
        # Horizontal line separator
        st.markdown("---")

    # Tab 2: Tweets After Cleaning
    with tab2:
        st.subheader('Tweets After Cleaning')
        st.write(pd.read_csv('cleaned_tweets.csv'))
        # Horizontal line separator
        st.markdown("---")

    # Tab 3: Tweets After Stemming
    with tab3:
        st.subheader('Tweets After Stemming')
        st.write(pd.read_csv('stemmed_tweets.csv'))
        # Horizontal line separator
        st.markdown("---")

    # Tab 4: Tweets After Tokenization
    with tab4:
        st.subheader('Tweets After Tokenization')
        st.write(pd.read_csv('Tokenized_Padded_tweets.csv'))
        # Horizontal line separator
        st.markdown("---")
276
+
277
+ elif selected == "ML Model Selection":
278
+ st.title("Model Selection")
279
+ st.write("""
280
+ (Classifier training and testing): Ten-fold cross-validation was used to train
281
+ and test all the six classifiers (logistic regression, decision tree, random forest,
282
+ naive Bayes, k-nearest neighbors, and support vector machines). We utilized
283
+ traditional machine learning methods provided by the Scikit-learn Python module
284
+ for classification. The Logistic Regression class uses L2 regularization with
285
+ a regularization parameter C equals 0.01. The hyper parameter used value of maximum depth
286
+ in decision trees and random forest equals 2. The hyper parameter used value of k in
287
+ k-nearest neighbors is 5, this means that the algorithm will consider the class or value of
288
+ the 5 nearest neighbors, when making predictions. In naive Bayes there are no specific default
289
+ values for this algorithm, as it does not require tuning hyper parameters. The hyper parameter
290
+ used value of C in SVM is 1.0.""")
291
+ # Horizontal line separator
292
+ st.markdown("---")
293
+ tab1, tab2 = st.tabs(["Classification Results", "Display Results Figures"])
294
+ # Tab 3: Table I. Classification Results
295
+ with tab1:
296
+ st.subheader('Table I. Classification Results')
297
+ # Define the data for the table
298
+ data = {
299
+ 'Algorithm': ['Logistic Regression', 'Decision Tree', 'Random Forest',
300
+ 'Naive Bayes', 'K-Nearest Neighbor', 'SVM - SVC'],
301
+ 'Precision': ['0.83 ± 0.04', '0.77 ± 0.06', '0.77 ± 0.06', '0.71 ± 0.07', '0.79 ± 0.05', '0.78 ± 0.05'],
302
+ 'Recall': ['0.96 ± 0.02', '1.00 ± 0.01', '1.00 ± 0.01', '0.96 ± 0.02', '0.90 ± 0.03', '1.00 ± 0.01'],
303
+ 'F1-Score': ['0.88 ± 0.02', '0.87 ± 0.03', '0.87 ± 0.03', '0.81 ± 0.04', '0.84 ± 0.04', '0.87 ± 0.03']
304
+ }
305
+
306
+ # Convert the data to a pandas DataFrame
307
+ df_results = pd.DataFrame(data)
308
+
309
+ # Display the table in Streamlit
310
+ st.table(df_results)
311
+ # Horizontal line separator
312
+ st.markdown("---")
313
+
314
+ # Tab 2: Display Results Figures
315
+ with tab2:
316
+ st.subheader('Display Results Figures')
317
+ # Data for the table
318
+ data = {
319
+ 'Algorithm': ['Logistic Regression', 'Decision Tree', 'Random Forest',
320
+ 'Naive Bayes', 'K-Nearest Neighbor', 'SVM - SVC'],
321
+ 'Precision': [0.83, 0.77, 0.77, 0.71, 0.79, 0.78],
322
+ 'Recall': [0.96, 1.00, 1.00, 0.96, 0.90, 1.00],
323
+ 'F1-Score': [0.88, 0.87, 0.87, 0.81, 0.84, 0.87]
324
+ }
325
+
326
+ # Convert the data to a pandas DataFrame (renaming it df_fig)
327
+ df_fig = pd.DataFrame(data)
328
+
329
+ # Create a grouped bar chart using Plotly
330
+ fig = go.Figure()
331
+
332
+ # Add Precision bars
333
+ fig.add_trace(go.Bar(x=df_fig['Algorithm'], y=df_fig['Precision'], name='Precision'))
334
+
335
+ # Add Recall bars
336
+ fig.add_trace(go.Bar(x=df_fig['Algorithm'], y=df_fig['Recall'], name='Recall'))
337
+
338
+ # Add F1-Score bars
339
+ fig.add_trace(go.Bar(x=df_fig['Algorithm'], y=df_fig['F1-Score'], name='F1-Score'))
340
+
341
+ # Update layout for grouped bars
342
+ fig.update_layout(
343
+ title='Classification Results',
344
+ xaxis_title='Algorithm',
345
+ yaxis_title='Score',
346
+ barmode='group', # Group the bars side by side
347
+ xaxis_tickangle=-45
348
+ )
349
+ # Display the plot in Streamlit
350
+ st.plotly_chart(fig)
351
+ st.markdown("---")
352
+
353
+ st.title("Results Clarifaction")
354
+ st.write("""
355
+ Looking at the results, it appears that the Decision Tree, Random Forest,
356
+ and SVM - SVC classifiers have the highest recall scores of 1.00 ± 0.01,
357
+ indicating that they are able to correctly identify all positive instances.
358
+ However, it's important to note that the precision scores for these classifiers
359
+ are slightly lower compared to Logistic Regression and K-Nearest Neighbor.
360
+ But, based on the evaluation metrics for hate speech detection in NLP,
361
+ the best classifier can be determined by considering the F1-score,
362
+ which is a measure of the model's overall performance. By looking at the F1-scores,
363
+ Logistic Regression has the highest F1-score of 0.88 ± 0.02, followed closely by
364
+ Decision Tree, Random Forest, and SVM - SVC, all with F1-scores of 0.87 ± 0.03.
365
+ Therefore, based on the F1-scores, Logistic Regression appears to be the best
366
+ classifier for hate speech detection in NLP. In addition, Logistic Regression has
367
+ the highest precision score of 0.83 ± 0.04. It also has a relatively high recall.""")
368
+ # Horizontal line separator
369
+ st.markdown("---")
370
+
371
+ elif selected == "Try The Model":
372
+ st.title("Tweet Tone Triage Application")
373
+ # Input box for entering the tweet
374
+ user_input = st.text_area("Enter the tweet:", "!!!!! RT @mleew17: boy dats cold...tyga dwn bad for cuffin dat hoe in the 1st place!!")
375
+
376
+ # Button to trigger prediction
377
+ if st.button('Predict'):
378
+ # Preprocessing steps
379
+ preprocessed_tweet = preprocess(user_input)
380
+ clean_tweet = preprocess_clean(preprocessed_tweet)
381
+ stripped_tweet = strip_hashtags(clean_tweet)
382
+ stemmed_tweet = stemming(stripped_tweet)
383
+
384
+ # Tokenize and pad the tweet
385
+ tokenizer = Tokenizer()
386
+ tokenizer.fit_on_texts(stemmed_tweet)
387
+ encoded_docs = tokenizer.texts_to_sequences(stemmed_tweet)
388
+ encoded_docs = [item for sublist in encoded_docs for item in sublist]
389
+ max_length = 100
390
+ padded_docs = pad_sequences([encoded_docs], maxlen=max_length, padding='post')
391
+
392
+ # Map the prediction to a human-readable label
393
+ label_map = {0: 'Hate Speech', 1: 'Offensive Language', 2: 'Neither'}
394
+
395
+ # Load the pre-trained Federated Deep Learning model
396
+ #with open('3T.pkl', 'rb') as f:
397
+ SFD_model = load_model("One-layer_BiLSTM_without_dropout.keras", custom_objects={'attention': attention})
398
+
399
+ # Load the pre-trained Logistic Regression model
400
+ with open('LR_model.pkl', 'rb') as f:
401
+ LR_model = pickle.load(f)
402
+
403
+ # Load the pre-trained Decision Tree model
404
+ with open('Random_Forest_Model.pkl', 'rb') as f:
405
+ Random_Forest_Model = pickle.load(f)
406
+
407
+ # Load the pre-trained Random Forest model
408
+ with open('Decision_Tree_Model.pkl', 'rb') as f:
409
+ Decision_Tree_Model = pickle.load(f)
410
+
411
+ # Load the pre-trained SVM - SVC model
412
+ with open('SVM_model.pkl', 'rb') as f:
413
+ SVM_model = pickle.load(f)
414
+
415
+ # Horizontal line separator
416
+ st.markdown("---")
417
+ st.write(f"preprocessed_tweet: {preprocessed_tweet}")
418
+ st.write(f"Cleaned_tweet: {clean_tweet}")
419
+ st.write(f"Stripped_tweet: {stripped_tweet}")
420
+ st.write(f"Stemmed_tweet: {stemmed_tweet}")
421
+ st.write(f"Tokenized_padded_docs: {padded_docs}")
422
+ # Horizontal line separator
423
+ st.markdown("---")
424
+
425
+ # Predict sentiment/class
426
+ predictions = SFD_model.predict(padded_docs)
427
+ y_pred = np.argmax(predictions, axis=1)
428
+
429
+ #y_pred = SFD_model.predict(padded_docs)
430
+ # Display prediction result
431
+ st.write(f"By Using A Secured Federated Deep Learning Model")
432
+ st.write(f"Prediction: {label_map[y_pred[0]]}")
433
+ st.write(f"Prediction_class: {y_pred}")
434
+
435
+ # Horizontal line separator
436
+ st.markdown("---")
437
+
438
+ # Predict sentiment/class
439
+ y_pred = LR_model.predict(padded_docs)
440
+ # Display prediction result
441
+ st.write(f"By Using Logistic Regression algorithm")
442
+ st.write(f"Prediction: {label_map[y_pred[0]]}")
443
+ st.write(f"Prediction_class: {y_pred}")
444
+ # Horizontal line separator
445
+ st.markdown("---")
446
+
447
+ # Predict sentiment/class
448
+ y_pred = Random_Forest_Model.predict(padded_docs)
449
+ # Display prediction result
450
+ st.write(f"By Using Decision Tree algorithm")
451
+ st.write(f"Prediction: {label_map[y_pred[0]]}")
452
+ st.write(f"Prediction_class: {y_pred}")
453
+ # Horizontal line separator
454
+ st.markdown("---")
455
+ # Predict sentiment/class
456
+ y_pred = Decision_Tree_Model.predict(padded_docs)
457
+ # Display prediction result
458
+ st.write(f"By Using Random Forest algorithm")
459
+ st.write(f"Prediction: {label_map[y_pred[0]]}")
460
+ st.write(f"Prediction_class: {y_pred}")
461
+ # Horizontal line separator
462
+ st.markdown("---")
463
+ # Predict sentiment/class
464
+ y_pred = SVM_model.predict(padded_docs)
465
+ # Display prediction result
466
+ st.write(f"By Using SVM-SVC algorithm")
467
+ st.write(f"Prediction: {label_map[y_pred[0]]}")
468
+ st.write(f"Prediction_class: {y_pred}")
469
+ # Horizontal line separator
470
+ st.markdown("---")
471
+
472
elif selected == "About":
    # Static app overview and references.
    st.title("About This App")

    st.write("""
This application is designed for the analysis of hate speech and offensive language in tweets.
It provides several functionalities, including:

- Loading and exploring the dataset
- Understanding class distribution of hate speech, offensive language, and neutral content
- Preprocessing tweets (removing URLs, mentions, emojis, and special characters)
- Tokenizing and padding tweet sequences for machine learning models
- Model selection and classification of tweets using traditional machine learning classifiers
- Testing a trained model for real-time predictions of tweet sentiment or class

**Key Features:**

- Utilizes a crowdsourced dataset from Davidson et al. (2017)
- Supports preprocessing steps like stemming and tokenization
- Provides an interactive interface for exploring dataset attributes, class distributions, and preprocessing steps
- Enables users to test machine learning models on custom tweets

**References:**

- Dataset Source: Davidson, T., Warmsley, D., Macy, M., & Weber, I. (2017). Automated hate speech detection and the problem of offensive language.
- Available on Kaggle: https://www.kaggle.com/datasets/mrmorj/hate-speech-and-offensive-language-dataset
""")

    # Horizontal line separator
    st.markdown("---")

elif selected == "Contact":
    # Static contact/credits page: supervisor photos and author details.
    st.title("Supervisors")

    # Introduction text
    st.write("This application was designed and deployed by **Tharwat El-Sayed Ismail**, under the supervision of:")

    # Load images (filenames are case-sensitive on Linux deployments)
    ayman_image = Image.open("Ayman Elsayed.jpg")
    abdallah_image = Image.open("Abdullah-N-Moustafa.png")
    tharwat_image = Image.open("Tharwat Elsayed Ismail.JPG")  # Replace with your image path

    # Display Prof. Dr. Ayman EL-Sayed info and image
    st.subheader("Prof. Dr. Ayman EL-Sayed")
    st.image(ayman_image, caption="Prof. Dr. Ayman EL-Sayed", width=200)
    st.write("[ayman.elsayed@el-eng.menofia.edu.eg](mailto:ayman.elsayed@el-eng.menofia.edu.eg)")

    # Display Dr. Abdallah Moustafa Nabil info and image
    st.subheader("Dr. Abdallah Moustafa Nabil")
    st.image(abdallah_image, caption="Dr. Abdallah Moustafa Nabil", width=200)
    st.write("[abdalla.moustafa@ejust.edu.eg](mailto:abdalla.moustafa@ejust.edu.eg)")

    # Display your contact info and image
    st.subheader("Eng. Tharwat El-Sayed Ismail")
    st.image(tharwat_image, caption="Tharwat El-Sayed Ismail", width=200)  # Adjust image size as needed
    st.write("[tharwat.elsayed@el-eng.menofia.edu.eg](mailto:tharwat.elsayed@el-eng.menofia.edu.eg)")

    # Horizontal line separator
    st.markdown("---")
    st.title("Contact Me")

    st.write("""
I’m Tharwat El-Sayed Ismail, (Data Scientist - AI Developer) I am a Data Scientist with expertise in statistical analysis, machine learning (ML), and data visualization, I bring a wealth of experience in Python, adept at extracting actionable insights to inform strategic decisions and effectively solve real-world problems. Additionally, I am an AI Developer proficient in Python, TensorFlow, and PyTorch, specialized in creating scalable AI solutions to drive business growth and enhance user experiences. Highly skilled in machine learning, natural language processing (NLP).

**Contact Information:**

- **Email:** tharwat_uss89@hotmail.com
- **LinkedIn:** [Tharwat El-Sayed](www.linkedin.com/in/tharwat-el-sayed-706276b1/)
- **Portfolio:** [View My Work](https://linktr.ee/tharwat.elsayed)

I look forward to connecting with you!
""")

    # Horizontal line separator
    st.markdown("---")
547
+
cleaned_tweets.csv ADDED
The diff for this file is too large to render. See raw diff
 
labeled_data.csv ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dash == 2.17.1
2
+ pandas == 2.2.2
3
+ plotly == 5.22.0
4
+ matplotlib==3.8.0
5
+ streamlit == 1.32.0
6
+ streamlit-option-menu==0.3.2
7
+ vaderSentiment
8
+ textstat
9
+ pyenchant
10
+ splitter
11
+ nltk
12
+ numpy
13
+ scikit-learn
14
+ tensorflow
15
+ seaborn
stemmed_tweets.csv ADDED
The diff for this file is too large to render. See raw diff