Prageeth-1 committed on
Commit
fe09de4
·
verified ·
1 Parent(s): 7d285e4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -101
app.py CHANGED
@@ -44,94 +44,6 @@ def load_classification_model():
44
  def load_qa_model():
45
  return pipeline("question-answering", model="deepset/roberta-base-squad2")
46
 
47
- # Preprocessing function (same as in Section 01)
48
- def preprocess_text():
49
- # Lowercase
50
- df["cleaned_content"] = df["content"].str.lower()
51
-
52
- # Remove URLs
53
- def remove_urls(text):
54
- url_pattern = re.compile(r'http[s]?://\S+[^\s.,;:()"\']')
55
- text = url_pattern.sub(r'', text)
56
- return text.strip()
57
-
58
- # applying the function
59
- df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_urls(text))
60
-
61
- # Remove Emails
62
- def remove_emails(text):
63
- email_pattern = re.compile(r'\S+@\S+')
64
- return email_pattern.sub(r'', text)
65
-
66
- # applying the function
67
- df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_emails(text))
68
-
69
- #Remove punctuations
70
- def remove_punctuation(text):
71
- return "".join([char for char in text if char not in string.punctuation])
72
-
73
- # applying the function
74
- df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_punctuation(text))
75
-
76
- # Get the list of stop words
77
- stop_words = set(stopwords.words('english'))
78
-
79
- # define the function
80
- def remove_stopwords(text):
81
- return " ".join([word for word in str(text).split() if word not in stop_words])
82
-
83
- # apply the function
84
- df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_stopwords(text))
85
-
86
- # define the function
87
- def remove_special_characters(text):
88
- return re.sub(r'[^A-Za-z\s]', '', text)
89
-
90
- # apply the function
91
- df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_special_characters(text))
92
-
93
- #Remove Frequent words
94
-
95
- # Get the count of each word in cleaned_text
96
- word_count = Counter(df["cleaned_content"].str.split(expand=True).stack())
97
-
98
- # Get a set of common words
99
- common_words = set([word for (word,count) in word_count.most_common(10)])
100
-
101
- # define the function
102
- def remove_common_words(text):
103
- return " ".join([word for word in str(text).split() if word not in common_words])
104
-
105
- # apply the function
106
- df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_common_words(text))
107
-
108
- #Remove rare words
109
- # Get a set of rare words
110
- rare_words = set([word for (word,count) in word_count.most_common()[:-20-1:-1]])
111
- print(rare_words)
112
-
113
- # define the function
114
- def remove_rare_words(text):
115
- return " ".join([word for word in str(text).split() if word not in rare_words])
116
-
117
- df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_rare_words(text))
118
-
119
- df['tokenized_content'] = df['cleaned_content'].apply(lambda text: text.split())
120
-
121
-
122
- # initialize stemmer
123
- stemmer = PorterStemmer()
124
-
125
- # Defining the function
126
- def stem_tokens(tokens):
127
- stems = [stemmer.stem(token) for token in tokens]
128
- return stems
129
-
130
- # apply the function
131
- df['stemmed_content'] = df['tokenized_content'].apply(lambda text: stem_tokens(text))
132
-
133
- df["preprocessed_content"] = df["stemmed_content"].apply(lambda text: " ".join(text))
134
-
135
 
136
 
137
  # Function to generate word cloud
@@ -285,25 +197,16 @@ with tab1:
285
 
286
  # apply the function
287
  df['stemmed_content'] = df['tokenized_content'].apply(lambda text: stem_tokens(text))
288
-
289
- df["preprocessed_content"] = df["stemmed_content"].apply(lambda text: " ".join(text))
290
-
291
-
292
-
293
-
294
 
295
-
296
-
297
-
298
-
299
-
300
 
301
 
 
 
302
  # Classify each article and store the predictions
303
  df["Class"] = df["preprocessed_content"].apply(lambda text: classifier(text)[0]["label"])
304
 
305
  #Delete Unnecessary columns
306
- df = df[['content', 'Class']]
307
 
308
 
309
  # Show results
@@ -316,7 +219,8 @@ with tab1:
316
  st.bar_chart(class_dist)
317
 
318
 
319
-
 
320
 
321
 
322
  # Download button
 
44
  def load_qa_model():
45
  return pipeline("question-answering", model="deepset/roberta-base-squad2")
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
 
49
  # Function to generate word cloud
 
197
 
198
  # apply the function
199
  df['stemmed_content'] = df['tokenized_content'].apply(lambda text: stem_tokens(text))
 
 
 
 
 
 
200
 
 
 
 
 
 
201
 
202
 
203
+ df["preprocessed_content"] = df["stemmed_content"].apply(lambda text: " ".join(text))
204
+
205
  # Classify each article and store the predictions
206
  df["Class"] = df["preprocessed_content"].apply(lambda text: classifier(text)[0]["label"])
207
 
208
  #Delete Unnecessary columns
209
+ df = df[['content','preprocessed_content','Class']]
210
 
211
 
212
  # Show results
 
219
  st.bar_chart(class_dist)
220
 
221
 
222
+ #Delete Unnecessary columns
223
+ df = df[['content','Class']]
224
 
225
 
226
  # Download button