|
|
import streamlit as st |
|
|
import pandas as pd |
|
|
|
|
|
# Inject the page-wide stylesheet. The <style> element is passed as raw HTML,
# which is why unsafe_allow_html=True is required on this call.
# NOTE(review): selectors such as `body`, `.sidebar .sidebar-content` and
# `.streamlit-button` depend on the DOM/class names Streamlit generates;
# confirm they still match the installed Streamlit version.
st.markdown("""
<style>
/* Set a soft background color */
body {
    background-color: #eef2f7;
}

/* Style for main title */
h1 {
    color: black;
    font-family: 'Roboto', sans-serif;
    font-weight: 700;
    text-align: center;
    margin-bottom: 25px;
}

/* Style for headers */
h2 {
    color: black;
    font-family: 'Roboto', sans-serif;
    font-weight: 600;
    margin-top: 30px;
}

/* Style for subheaders */
h3 {
    color: red;
    font-family: 'Roboto', sans-serif;
    font-weight: 500;
    margin-top: 20px;
}

.custom-subheader {
    color: black;
    font-family: 'Roboto', sans-serif;
    font-weight: 600;
    margin-bottom: 15px;
}

/* Paragraph styling */
p {
    font-family: 'Georgia', serif;
    line-height: 1.8;
    color: black;
    margin-bottom: 20px;
}

/* List styling with checkmark bullets */
.icon-bullet {
    list-style-type: none;
    padding-left: 20px;
}

.icon-bullet li {
    font-family: 'Georgia', serif;
    font-size: 1.1em;
    margin-bottom: 10px;
    color: black;
}

.icon-bullet li::before {
    content: "✔";
    padding-right: 10px;
    color: black;
}

/* Sidebar styling */
.sidebar .sidebar-content {
    background-color: #ffffff;
    border-radius: 10px;
    padding: 15px;
}

.sidebar h2 {
    color: #495057;
}

/* Custom button style */
.streamlit-button {
    background-color: #00FFFF;
    color: #000000;
    font-weight: bold;
}
</style>
""", unsafe_allow_html=True)
|
|
|
|
|
# Page heading; the :red[...] directive colours the header text.
st.header(":red[📊 Simple EDA 🔬]")

# Introductory section rendered as raw HTML fragments.
# NOTE(review): each st.markdown call is emitted as its own element, so this
# opening <div> presumably does not wrap the calls that follow it, and no CSS
# rule for `.section`, `.title` or `.subtitle` is visible in this file —
# confirm whether the wrapper and classes are still needed.
st.markdown("<div class='section'>", unsafe_allow_html=True)
st.markdown("<h2 class='title'>📌 Understanding Simple EDA</h2>", unsafe_allow_html=True)
st.markdown("<p class='subtitle'>Evaluating raw text data quality before processing</p>", unsafe_allow_html=True)

# One info box; blank lines ("\n\n") separate the lead sentence and each
# check-marked point into their own Markdown paragraphs.
st.info(
    "🔍 **Simple EDA is a crucial step in the NLP lifecycle:**\n\n"
    "✅ Ensures raw data quality\n\n"
    "✅ Not dependent on problem statement\n\n"
    "✅ Helps in better data exploration"
)

st.markdown("</div>", unsafe_allow_html=True)
|
|
|
|
|
st.subheader(":violet[📝 Major Simple EDA Steps]")

# (step title, description) pairs, listed in the order they are displayed.
# Descriptions are Markdown and may contain bold text or inline code.
_EDA_STEPS = (
    ("Check Text Case", "Identify if text is in **lowercase, uppercase, or mixed case**."),
    ("Detect HTML & URL Tags", "Analyze if text contains unwanted elements."),
    ("Identify URLs", "Ensure URLs are either preserved or removed based on problem statement."),
    ("Detect Mentions & Hashtags", "Find occurrences of `@mentions` or `#hashtags`."),
    ("Identify Numeric Data", "Detect if text includes **digits or numerical data**."),
    ("Analyze Punctuation Usage", "Check whether punctuation marks affect text clarity."),
    ("Detect Emojis", "Ensure **emoji-based sentiments** are not lost."),
    ("Analyze Date/Time Formats", "Identify the presence of date/time-related text."),
)

# One st.markdown call per step keeps every step as its own page element.
for _step_title, _step_detail in _EDA_STEPS:
    st.markdown(f"✅ **{_step_title}** – {_step_detail}")

st.success("🚀 Performing **Simple EDA** ensures structured and high-quality text data, leading to better NLP model performance!")
|
|
|
|
|
|
|
|
# Sample implementation shown to the reader (displayed, never executed here).
# The literal is a raw string so the regex backslashes in the sample are
# displayed exactly as written instead of being processed as Python escapes.
st.code(r'''
import pandas as pd
import numpy as np
import re
import emoji


def simple_eda(data, column):
    # Each check counts the rows of data[column] in which the feature occurs.
    # Mixed case: the text differs from both its lowercase and uppercase form.
    lower_upper = data[column].apply(lambda x: x != x.lower() and x != x.upper()).sum()
    tags = data[column].apply(lambda x: bool(re.search(r"<.*?>", x))).sum()
    urls = data[column].apply(lambda x: bool(re.search(r"https?://\S+", x))).sum()
    mails = data[column].apply(lambda x: bool(re.search(r"\S+@\S+", x))).sum()
    mentions = data[column].apply(lambda x: bool(re.search(r"\B[@#]\S+", x))).sum()
    emojis = data[column].apply(lambda x: bool(emoji.emoji_count(x))).sum()
    digit = data[column].apply(lambda x: bool(re.search(r"\d", x))).sum()
    punc = data[column].apply(lambda x: bool(re.search('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]', x))).sum()
    # Dates such as 5/11/2024 anywhere in the text, not only as the whole text.
    dates = data[column].apply(lambda x: bool(re.search(r"\b[0-9]{1,2}/[0-9]{1,2}/[0-9]{4}\b", x))).sum()

    if lower_upper > 0:
        print("text have combination")
    if tags > 0:
        print("text have tags")
    if urls > 0:
        print("text have urls")
    if mails > 0:
        print("text have mails")
    if mentions > 0:
        print("text have mentions")
    if emojis > 0:
        print("text have emojis")
    if digit > 0:
        print("text have digit")
    if punc > 0:
        print("text have punctuations")
    if dates > 0:
        print("text have dates")
''')
|
|
|
|
|
# Closing remarks, rendered as a Markdown bullet list (one bullet per entry).
_closing_notes = (
    "- By the following code we will check the exploration of the data",
    "- Basically it gives the quality of collected text data",
    "- After the simple eda we will perform pre-processing on text based on problem statement after knowing quality of the data",
)

# The leading and trailing newlines reproduce the layout of the original
# triple-quoted literal.
st.markdown("\n" + "\n".join(_closing_notes) + "\n")