Spaces:

Harika22
/

Natural_Language_Processing

Sleeping

App Files Files Community

Harika22 committed on Feb 1, 2025

Commit

ae16ad4

verified ·

1 Parent(s): 0391467

Update pages/5_Pre-procesing_of_text.py

Browse files

Files changed (1) hide show

pages/5_Pre-procesing_of_text.py +80 -1

pages/5_Pre-procesing_of_text.py CHANGED Viewed

@@ -233,4 +233,83 @@ st.markdown(
     </div>
     """,
     unsafe_allow_html=True
-)

     </div>
     """,
     unsafe_allow_html=True
+)
+st.code(r'''
+            import re
+
+            import contractions
+            import emoji
+            from nltk.corpus import stopwords
+            from nltk.stem import PorterStemmer,LancasterStemmer,SnowballStemmer,WordNetLemmatizer
+            from nltk.tokenize import sent_tokenize,word_tokenize
+
+            def pre_process(data,col,case="lower",tags=True,url=True,mail=True,mentions=True,digits=True,dates=True,emojis=True,contraction=True,stopwordss=True,inflection="stem",stemmer="porter",punc=True):
+                """Clean the text column ``col`` of ``data`` in place and return ``data``.
+
+                ``data`` is expected to be a pandas DataFrame (the code relies on
+                ``data[col].apply`` and ``data[col].str``). Every enabled step
+                replaces what it matches with a single space. Steps run in this
+                order: emojis, case, HTML tags, URLs, e-mail addresses,
+                mentions/hashtags, dates, digits, contractions, punctuation.
+
+                Parameters
+                ----------
+                data : table holding the text column (modified in place).
+                col : name of the text column to clean.
+                case : "lower" or "upper" to change case; any other value keeps it.
+                tags, url, mail, mentions, digits, dates, emojis, contraction, punc :
+                    switches that turn the matching cleaning step on or off.
+                stopwordss, inflection, stemmer :
+                    accepted for interface compatibility; the steps shown in this
+                    snippet do not use them.
+
+                Returns
+                -------
+                The same ``data`` object, with ``data[col]`` cleaned.
+                """
+                # Helpers for stop-word removal and stemming/lemmatization.
+                # They are created here but not used by the steps in this snippet.
+                stp = stopwords.words("english")
+                stp.remove("not")  # keep the negation word out of the stop list
+                ps = PorterStemmer()
+                ls = LancasterStemmer()
+                sb = SnowballStemmer(language="english")
+                wl = WordNetLemmatizer()
+                ## emoji
+                if emojis:
+                    data[col] = data[col].apply(lambda x:emoji.demojize(x,delimiters=('','')))
+                ## case
+                if case == "lower":
+                    data[col] = data[col].str.lower()
+                elif case == "upper":
+                    data[col] = data[col].str.upper()
+                ## tags
+                if tags:
+                    data[col] = data[col].apply(lambda x:re.sub(r"<.*?>"," ",x))
+                ## urls (both http:// and https://)
+                if url:
+                    data[col] = data[col].apply(lambda x:re.sub(r"https?://\S+"," ",x))
+                ## mails
+                if mail:
+                    data[col] = data[col].apply(lambda x:re.sub(r"\S+@\S+"," ",x))
+                ## mentions
+                if mentions:
+                    data[col] = data[col].apply(lambda x:re.sub(r"\B[@#]\S+"," ",x))
+                ## dates
+                # Must run before the digits step: once digits are stripped there
+                # is nothing left for these patterns to match. Word boundaries
+                # (instead of ^...$) let dates embedded in a sentence be removed.
+                if dates:
+                    data[col] = data[col].apply(lambda x:re.sub(r"\b[0-9]{1,2}/[0-9]{1,2}/[0-9]{4}\b"," ",x))
+                    data[col] = data[col].apply(lambda x:re.sub(r"\b[0-9]{4}/[0-9]{1,2}/[0-9]{1,2}\b"," ",x))
+                ## digits
+                if digits:
+                    data[col] = data[col].apply(lambda x:re.sub(r"\d"," ",x))
+                ## contractions
+                # Runs before punctuation removal so apostrophes are still present.
+                if contraction:
+                    data[col] = data[col].apply(contractions.fix)
+                ## punctuations
+                if punc:
+                    data[col] = data[col].apply(lambda x:re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]'," ",x))
+                return data
+''')