Harika22 committed on
Commit
ae16ad4
·
verified ·
1 Parent(s): 0391467

Update pages/5_Pre-procesing_of_text.py

Browse files
Files changed (1) hide show
  1. pages/5_Pre-procesing_of_text.py +80 -1
pages/5_Pre-procesing_of_text.py CHANGED
@@ -233,4 +233,83 @@ st.markdown(
233
  </div>
234
  """,
235
  unsafe_allow_html=True
236
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
  </div>
234
  """,
235
  unsafe_allow_html=True
236
+ )
237
+
238
+ st.code('''
239
+ from nltk.corpus import stopwords
240
+ from nltk.stem import PorterStemmer,LancasterStemmer,SnowballStemmer,WordNetLemmatizer
241
+ from nltk.tokenize import sent_tokenize,word_tokenize
242
+
243
def pre_process(
    data,
    col,
    case="lower",
    tags=True,
    url=True,
    mail=True,
    mentions=True,
    digits=True,
    dates=True,
    emojis=True,
    contraction=True,
    stopwordss=True,
    inflection="stem",
    stemmer="porter",
    punc=True,
):
    """Clean the text column ``col`` of ``data`` in place and return ``data``.

    Each cleaning step can be switched off through its flag. Steps run in
    this order: emojis, case, tags, urls, mails, mentions, dates, digits,
    contractions, punctuation. Every removed span is replaced by a single
    space, so word boundaries are never glued together.

    Args:
        data: pandas DataFrame holding the text; it is modified in place.
        col: name of the text column to clean.
        case: "lower" or "upper" to change case; any other value keeps it.
        tags: blank out ``<...>`` markup.
        url: blank out ``http://``, ``https://`` and ``www.`` links.
        mail: blank out e-mail-like tokens.
        mentions: blank out ``@mention`` and ``#hashtag`` tokens.
        digits: blank out every digit.
        dates: blank out ``dd/mm/yyyy`` and ``yyyy/mm/dd`` style dates.
        emojis: turn emojis into their text names (needs ``emoji``).
        contraction: expand contractions (needs ``contractions``).
        stopwordss, inflection, stemmer: accepted so existing calls keep
            working; this function does not act on them.
        punc: blank out ASCII punctuation.

    Returns:
        The same DataFrame object that was passed in.
    """
    import re
    import string

    def _blank_out(pattern):
        # The .str accessor skips missing values instead of raising on them.
        data[col] = data[col].str.replace(pattern, " ", regex=True)

    if emojis:
        # Imported only when needed so the other steps work without it.
        import emoji

        data[col] = data[col].apply(
            lambda text: emoji.demojize(text, delimiters=("", ""))
            if isinstance(text, str)
            else text
        )

    if case == "lower":
        data[col] = data[col].str.lower()
    elif case == "upper":
        data[col] = data[col].str.upper()

    if tags:
        _blank_out(r"<.*?>")

    if url:
        _blank_out(r"https?://\S+|www\.\S+")

    if mail:
        _blank_out(r"\S+@\S+")

    if mentions:
        _blank_out(r"\B[@#]\S+")

    # Dates must go before digits: once the digits are blanked there is
    # nothing left for the date patterns to match. The lookarounds keep a
    # date from being cut out of the middle of a longer number.
    if dates:
        _blank_out(r"(?<!\d)\d{1,2}/\d{1,2}/\d{4}(?!\d)")
        _blank_out(r"(?<!\d)\d{4}/\d{1,2}/\d{1,2}(?!\d)")

    if digits:
        _blank_out(r"\d")

    if contraction:
        # Imported only when needed so the other steps work without it.
        import contractions

        data[col] = data[col].apply(
            lambda text: contractions.fix(text) if isinstance(text, str) else text
        )

    if punc:
        # Built from string.punctuation so no symbol is forgotten.
        _blank_out("[" + re.escape(string.punctuation) + "]")

    return data
315
+ ''')