Prageeth-1 committed on
Commit
5ddd7af
·
verified ·
1 Parent(s): 991db0d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +94 -1
app.py CHANGED
@@ -190,7 +190,100 @@ with tab1:
190
  classifier = pipeline("text-classification", model="Prageeth-1/News_classification.2")
191
 
192
  # Preprocess
193
- preprocess_text()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
 
196
  # Classify each article and store the predictions
 
190
  classifier = pipeline("text-classification", model="Prageeth-1/News_classification.2")
191
 
192
# Preprocess
# Lower-case the raw article text up front so every later matching step
# (stop words, frequent/rare words) is case-insensitive.
df["cleaned_content"] = df["content"].str.lower()
195
+
196
# Remove URLs
def remove_urls(text):
    """Strip http(s):// URLs from *text* and trim leading/trailing whitespace.

    The trailing character class makes the match end on a character that is
    not whitespace or common sentence punctuation.
    """
    cleaned = re.sub(r'http[s]?://\S+[^\s.,;:()"\']', '', text)
    return cleaned.strip()
201
+
202
# Apply URL removal row-by-row; pass the function itself — the original
# lambda wrapper (lambda text: remove_urls(text)) added nothing.
df["cleaned_content"] = df["cleaned_content"].apply(remove_urls)
204
+
205
# Remove Emails
def remove_emails(text):
    """Delete anything shaped like an e-mail address (token@token) from *text*."""
    return re.sub(r'\S+@\S+', '', text)
209
+
210
# Apply e-mail removal; the function is passed directly instead of the
# original redundant lambda wrapper.
df["cleaned_content"] = df["cleaned_content"].apply(remove_emails)
212
+
213
# Remove punctuation
def remove_punctuation(text):
    """Delete every ASCII punctuation character (string.punctuation) from *text*."""
    # str.translate does the same character filter as the original
    # join-comprehension in one C-level pass.
    return text.translate(str.maketrans('', '', string.punctuation))
216
+
217
# Apply punctuation removal; pass the function directly (no lambda needed).
df["cleaned_content"] = df["cleaned_content"].apply(remove_punctuation)
219
+
220
# Build the English stop-word set once at this scope so the per-row
# function only does cheap O(1) membership tests.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Drop English stop words from whitespace-tokenised *text*."""
    return " ".join([word for word in str(text).split() if word not in stop_words])

# Apply stop-word removal; the function is passed directly — the original
# lambda wrapper was redundant.
df["cleaned_content"] = df["cleaned_content"].apply(remove_stopwords)
229
+
230
# Define the function
def remove_special_characters(text):
    """Keep only ASCII letters and whitespace in *text*; drop digits/symbols."""
    non_letters = re.compile(r'[^A-Za-z\s]')
    return non_letters.sub('', text)
233
+
234
# Apply special-character removal; pass the function directly (no lambda).
df["cleaned_content"] = df["cleaned_content"].apply(remove_special_characters)
236
+
237
# Remove frequent words

# Count every token across the whole corpus once; this Counter is reused
# below for the rare-word step as well.
word_count = Counter(df["cleaned_content"].str.split(expand=True).stack())

# Treat the 10 most frequent tokens as corpus-specific noise.
common_words = set([word for (word, count) in word_count.most_common(10)])

# Define the function
def remove_common_words(text):
    """Drop the corpus's 10 most common tokens from *text*."""
    return " ".join([word for word in str(text).split() if word not in common_words])

# Apply the function directly (the original lambda wrapper was redundant).
df["cleaned_content"] = df["cleaned_content"].apply(remove_common_words)
251
+
252
# Remove rare words
# Take the 20 least frequent tokens: the slice [:-20-1:-1] walks
# most_common() backwards from the tail (rarest first).
rare_words = set([word for (word, count) in word_count.most_common()[:-20-1:-1]])
# NOTE(review): the original left a debug `print(rare_words)` here; removed —
# it only wrote to server stdout in this Streamlit app.

# Define the function
def remove_rare_words(text):
    """Drop the corpus's 20 rarest tokens from *text*."""
    return " ".join([word for word in str(text).split() if word not in rare_words])

df["cleaned_content"] = df["cleaned_content"].apply(remove_rare_words)
262
+
263
# Tokenise the cleaned text into word lists for stemming; pass the
# tokenizer directly instead of wrapping it in a redundant lambda.
df['tokenized_content'] = df['cleaned_content'].apply(nltk.word_tokenize)
264
+
265
# Initialise the Porter stemmer once and reuse it for every row.
stemmer = PorterStemmer()

# Defining the function
def stem_tokens(tokens):
    """Stem each token in *tokens* with the shared Porter stemmer."""
    return [stemmer.stem(token) for token in tokens]

# Apply the function directly (the original lambda wrapper was redundant).
df['stemmed_content'] = df['tokenized_content'].apply(stem_tokens)

# Re-join the stems into a single string — the classifier expects plain text.
df["preprocessed_content"] = df["stemmed_content"].apply(" ".join)
277
+
278
+
279
+
280
+
281
+
282
+
283
+
284
+
285
+
286
+
287
 
288
 
289
  # Classify each article and store the predictions