# Streamlit page: "Simple EDA" for NLP text data.
#
# The script renders, top to bottom: a page-wide CSS block, a short
# introduction, the list of checks that make up simple EDA, a copyable sample
# implementation of those checks, and closing remarks.
#
# Plain `#` comments are used instead of a module docstring so that nothing
# here can be picked up and rendered as page content.
import streamlit as st
import pandas as pd

# CSS injected once at the top of the page via st.markdown(unsafe_allow_html=True).
# The <style> tag stays at column 0 so Markdown treats the whole thing as an
# HTML block rather than an indented code block.
# NOTE(review): `body`, `.sidebar .sidebar-content` and `.streamlit-button`
# may not match the markup emitted by the installed Streamlit version --
# confirm in the browser inspector before relying on these rules.
PAGE_CSS = """
<style>
/* Set a soft background color */
body {
    background-color: #eef2f7;
}
/* Style for main title */
h1 {
    color: black;
    font-family: 'Roboto', sans-serif;
    font-weight: 700;
    text-align: center;
    margin-bottom: 25px;
}
/* Style for headers */
h2 {
    color: black;
    font-family: 'Roboto', sans-serif;
    font-weight: 600;
    margin-top: 30px;
}
/* Style for subheaders */
h3 {
    color: red;
    font-family: 'Roboto', sans-serif;
    font-weight: 500;
    margin-top: 20px;
}
.custom-subheader {
    color: black;
    font-family: 'Roboto', sans-serif;
    font-weight: 600;
    margin-bottom: 15px;
}
/* Paragraph styling */
p {
    font-family: 'Georgia', serif;
    line-height: 1.8;
    color: black;
    margin-bottom: 20px;
}
/* List styling with checkmark bullets */
.icon-bullet {
    list-style-type: none;
    padding-left: 20px;
}
.icon-bullet li {
    font-family: 'Georgia', serif;
    font-size: 1.1em;
    margin-bottom: 10px;
    color: black;
}
.icon-bullet li::before {
    content: "◆";
    padding-right: 10px;
    color: black;
}
/* Sidebar styling */
.sidebar .sidebar-content {
    background-color: #ffffff;
    border-radius: 10px;
    padding: 15px;
}
.sidebar h2 {
    color: #495057;
}
/* Custom button style */
.streamlit-button {
    background-color: #00FFFF;
    color: #000000;
    font-weight: bold;
}
</style>
"""

# Sample code shown to the reader with st.code.
# This MUST be a raw string: in a normal literal the regex backslashes
# (\S, \d, \B) are invalid escape sequences and any escaped quote collapses,
# so the snippet displayed on the page would differ from what is written here
# and would not run when copied.
SAMPLE_CODE = r'''
import pandas as pd
import numpy as np
import re
import string
import emoji

def simple_eda(data, column):
    # Mixed case = the text differs from both its lower- and upper-cased form.
    lower_upper = data[column].apply(lambda x: x != x.lower() and x != x.upper()).sum()
    tags = data[column].apply(lambda x: bool(re.search(r"<.*?>", x))).sum()
    urls = data[column].apply(lambda x: bool(re.search(r"https?://\S+", x))).sum()
    mails = data[column].apply(lambda x: bool(re.search(r"\S+@\S+", x))).sum()
    mentions = data[column].apply(lambda x: bool(re.search(r"\B[@#]\S+", x))).sum()
    emojis = data[column].apply(lambda x: bool(emoji.emoji_count(x))).sum()
    digit = data[column].apply(lambda x: bool(re.search(r"\d", x))).sum()
    punc = data[column].apply(lambda x: bool(re.search("[" + re.escape(string.punctuation) + "]", x))).sum()
    # \b instead of ^...$ so a date is found anywhere inside the text.
    dates = data[column].apply(lambda x: bool(re.search(r"\b[0-9]{1,2}/[0-9]{1,2}/[0-9]{4}\b", x))).sum()

    if lower_upper > 0:
        print("text have combination")
    if tags > 0:
        print("text have tags")
    if urls > 0:
        print("text have urls")
    if mails > 0:
        print("text have mails")
    if mentions > 0:
        print("text have mentions")
    if emojis > 0:
        print("text have emojis")
    if digit > 0:
        print("text have digit")
    if punc > 0:
        print("text have punctuations")
    if dates > 0:
        print("text have dates")
'''

# --- Page layout (statement order is the on-screen order) -------------------

st.markdown(PAGE_CSS, unsafe_allow_html=True)

st.header(":red[Simple EDA💬]")
st.markdown('''
- Simple EDA is a part of life cycle in NLP where after collecting the raw data we need to perform simple eda which tells the quality of the data
- Simple EDA is not performed based on the problem statement
- It checks the exploration of the data
''')

st.subheader(":violet[Major Simple EDA📃]")
# The three case options are sub-bullets of the first item.
st.markdown('''
- Whether all the alphabets are in
    - lower case
    - upper case
    - combination of lower and upper case
- Whether the collected text data contains any html / url tags
- Whether the collected text data contains any urls
- Whether the collected text data contains any mentions / hashtags
- Whether the collected text data contains any digits
- Whether the collected text data contains any punctuations
- Whether the collected text data contains any emojis
- Whether the collected text data contains any date / time
''')

st.code(SAMPLE_CODE)

st.markdown('''
- By the above code we will check the exploration of the data
- Basically it gives the quality of collected text data
- After the simple eda we will perform pre-processing on text based on problem statement after knowing quality of the data
''')