File size: 5,474 Bytes
aed7e2c 896d8db aed7e2c c32fa29 bac792f c32fa29 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 |
import streamlit as st
import pandas as pd
# Inject page-wide CSS through a raw <style> element. st.markdown escapes HTML
# unless unsafe_allow_html=True is passed, so the flag is required here.
# The bullet glyph for `.icon-bullet li::before` was a mis-decoded character
# ("β"); it is restored to a check mark to agree with the "checkmark bullets"
# comment in the stylesheet.
# NOTE(review): selectors such as `.sidebar .sidebar-content` and
# `.streamlit-button` may not match the class names Streamlit actually emits —
# confirm in the browser inspector.
st.markdown("""
<style>
/* Set a soft background color */
body {
background-color: #eef2f7;
}
/* Style for main title */
h1 {
color: black;
font-family: 'Roboto', sans-serif;
font-weight: 700;
text-align: center;
margin-bottom: 25px;
}
/* Style for headers */
h2 {
color: black;
font-family: 'Roboto', sans-serif;
font-weight: 600;
margin-top: 30px;
}
/* Style for subheaders */
h3 {
color: red;
font-family: 'Roboto', sans-serif;
font-weight: 500;
margin-top: 20px;
}
.custom-subheader {
color: black;
font-family: 'Roboto', sans-serif;
font-weight: 600;
margin-bottom: 15px;
}
/* Paragraph styling */
p {
font-family: 'Georgia', serif;
line-height: 1.8;
color: black;
margin-bottom: 20px;
}
/* List styling with checkmark bullets */
.icon-bullet {
list-style-type: none;
padding-left: 20px;
}
.icon-bullet li {
font-family: 'Georgia', serif;
font-size: 1.1em;
margin-bottom: 10px;
color: black;
}
.icon-bullet li::before {
content: "✔";
padding-right: 10px;
color: black;
}
/* Sidebar styling */
.sidebar .sidebar-content {
background-color: #ffffff;
border-radius: 10px;
padding: 15px;
}
.sidebar h2 {
color: #495057;
}
/* Custom button style */
.streamlit-button {
background-color: #00FFFF;
color: #000000;
font-weight: bold;
}
</style>
""", unsafe_allow_html=True)
# Page header followed by the introductory "Understanding Simple EDA" section.
# NOTE(review): the "π" / "π¬" glyphs in the strings below look like emoji that
# were mis-decoded; the intended code points cannot be recovered from this
# file, so they are left untouched — replace them with the intended emoji.
st.header(":red[π Simple EDA π¬]")
# Introduction to Simple EDA
# NOTE(review): the opening and closing <div> are sent in separate
# st.markdown calls; presumably each call is rendered as its own element, so
# the div may not actually wrap the content in between — confirm.
st.markdown("<div class='section'>", unsafe_allow_html=True)
st.markdown("<h2 class='title'>π Understanding Simple EDA</h2>", unsafe_allow_html=True)
st.markdown("<p class='subtitle'>Evaluating raw text data quality before processing</p>", unsafe_allow_html=True)
# The message was previously one single-quoted literal broken across physical
# lines (a SyntaxError). It is rebuilt from adjacent literals; the leading
# bullet glyph is presumed to have been a check-mark emoji.
st.info(
    "π **Simple EDA is a crucial step in the NLP lifecycle:**\n\n"
    "✅ Ensures raw data quality\n\n"
    "✅ Not dependent on problem statement\n\n"
    "✅ Helps in better data exploration"
)
st.markdown("</div>", unsafe_allow_html=True)
# Checklist of the major Simple EDA steps, one markdown element per step.
# NOTE(review): the "π" glyphs are residue of mis-decoded emoji whose identity
# cannot be recovered here; they are left as found.
st.subheader(":violet[π Major Simple EDA Steps]")
# Each entry was previously a string literal split across two physical lines
# (a SyntaxError) with garbled bullet and separator glyphs. The bullet is
# presumed to be a check-mark emoji and the separator a dash.
_EDA_STEPS = (
    "✅ **Check Text Case** — Identify if text is in **lowercase, uppercase, or mixed case**.",
    "✅ **Detect HTML & URL Tags** — Analyze if text contains unwanted elements.",
    "✅ **Identify URLs** — Ensure URLs are either preserved or removed based on problem statement.",
    "✅ **Detect Mentions & Hashtags** — Find occurrences of `@mentions` or `#hashtags`.",
    "✅ **Identify Numeric Data** — Detect if text includes **digits or numerical data**.",
    "✅ **Analyze Punctuation Usage** — Check whether punctuation marks affect text clarity.",
    "✅ **Detect Emojis** — Ensure **emoji-based sentiments** are not lost.",
    "✅ **Analyze Date/Time Formats** — Identify the presence of date/time-related text.",
)
# One st.markdown call per step, matching the original one-call-per-line layout.
for _step in _EDA_STEPS:
    st.markdown(_step)
st.success("π Performing **Simple EDA** ensures structured and high-quality text data, leading to better NLP model performance!")
# Show the reference `simple_eda` implementation to the reader.
# The literal is a raw string so the backslashes in the snippet's regular
# expressions (\S, \d, \B, \', \\) are displayed exactly as written instead of
# being interpreted as escapes by this file's own parser; without it the shown
# punctuation pattern loses its \' and no longer parses.
# The snippet is display-only: `numpy`, `re` and `emoji` are imported inside
# the shown text, not by this app.
# Fixes to the shown code: function body indentation restored; the case check
# now flags only text that mixes upper and lower case (the previous expression
# was true for any non-empty string); the URL pattern also matches http://.
st.code(r'''
import pandas as pd
import numpy as np
import re
import emoji
def simple_eda(data,column):
    lower_upper = data[column].apply(lambda x:True if (x != x.lower()) and (x != x.upper()) else False).sum()
    tags = data[column].apply(lambda x:True if re.search("<.*?>",x) else False).sum()
    urls = data[column].apply(lambda x:True if re.search(r"https?://\S+",x) else False).sum()
    mails = data[column].apply(lambda x:True if re.search(r"\S+@\S+",x) else False).sum()
    mentions = data[column].apply(lambda x:True if re.search(r"\B[@#]\S+",x) else False).sum()
    emojis = data[column].apply(lambda x:True if emoji.emoji_count(x) else False).sum()
    digit = data[column].apply(lambda x:True if re.search(r"\d",x) else False).sum()
    punc = data[column].apply(lambda x:True if re.search('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]',x) else False).sum()
    dates = data[column].apply(lambda x:True if re.search(r"^[0-9]{1,2}\/[0-9]{1,2}\/[0-9]{4}$",x) else False).sum()
    if lower_upper >0:
        print("text have combination")
    if tags > 0:
        print("text have tags")
    if urls >0:
        print("text have urls")
    if mails > 0:
        print("text have mails")
    if mentions >0:
        print("text have mentions")
    if emojis > 0:
        print("text have emojis")
    if digit >0:
        print("text have digit")
    if punc > 0:
        print("text have punctuations")
    if dates >0:
        print("text have dates")
''')
# Closing bullet list summarising what the Simple EDA snippet is used for.
# A stray "|" that followed the closing parenthesis (a syntax error) is removed;
# the displayed text itself is unchanged.
st.markdown('''
- By the following code we will check the exploration of the data
- Basically it gives the quality of collected text data
- After the simple eda we will perform pre-processing on text based on problem statement after knowing quality of the data
''')