LakshmiHarika committed on
Commit
317b214
·
verified ·
1 Parent(s): 8380b3c

Create 4.Simple EDA.py

Browse files
Files changed (1) hide show
  1. pages/4.Simple EDA.py +147 -0
pages/4.Simple EDA.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+
4
+ st.markdown("""
5
+ <style>
6
+ /* Set a soft background color */
7
+ body {
8
+ background-color: #eef2f7;
9
+ }
10
+ /* Style for main title */
11
+ h1 {
12
+ color: black;
13
+ font-family: 'Roboto', sans-serif;
14
+ font-weight: 700;
15
+ text-align: center;
16
+ margin-bottom: 25px;
17
+ }
18
+ /* Style for headers */
19
+ h2 {
20
+ color: black;
21
+ font-family: 'Roboto', sans-serif;
22
+ font-weight: 600;
23
+ margin-top: 30px;
24
+ }
25
+
26
+ /* Style for subheaders */
27
+ h3 {
28
+ color: red;
29
+ font-family: 'Roboto', sans-serif;
30
+ font-weight: 500;
31
+ margin-top: 20px;
32
+ }
33
+ .custom-subheader {
34
+ color: black;
35
+ font-family: 'Roboto', sans-serif;
36
+ font-weight: 600;
37
+ margin-bottom: 15px;
38
+ }
39
+ /* Paragraph styling */
40
+ p {
41
+ font-family: 'Georgia', serif;
42
+ line-height: 1.8;
43
+ color: black;
44
+ margin-bottom: 20px;
45
+ }
46
+ /* List styling with checkmark bullets */
47
+ .icon-bullet {
48
+ list-style-type: none;
49
+ padding-left: 20px;
50
+ }
51
+ .icon-bullet li {
52
+ font-family: 'Georgia', serif;
53
+ font-size: 1.1em;
54
+ margin-bottom: 10px;
55
+ color: black;
56
+ }
57
+ .icon-bullet li::before {
58
+ content: "◆";
59
+ padding-right: 10px;
60
+ color: black;
61
+ }
62
+ /* Sidebar styling */
63
+ .sidebar .sidebar-content {
64
+ background-color: #ffffff;
65
+ border-radius: 10px;
66
+ padding: 15px;
67
+ }
68
+ .sidebar h2 {
69
+ color: #495057;
70
+ }
71
+ /* Custom button style */
72
+ .streamlit-button {
73
+ background-color: #00FFFF;
74
+ color: #000000;
75
+ font-weight: bold;
76
+ }
77
+ </style>
78
+ """, unsafe_allow_html=True)
79
+
80
+ st.header(":red[📊 Simple EDA 💬]")
81
+
82
+ # Introduction to Simple EDA
83
+ st.markdown("<div class='section'>", unsafe_allow_html=True)
84
+ st.markdown("<h2 class='title'>🔍 Understanding Simple EDA</h2>", unsafe_allow_html=True)
85
+ st.markdown("<p class='subtitle'>Evaluating raw text data quality before processing</p>", unsafe_allow_html=True)
86
+
87
+ st.info("📌 **Simple EDA is a crucial step in the NLP lifecycle:**\n\n✅ Ensures raw data quality\n\n✅ Not dependent on problem statement\n\n✅ Helps in better data exploration")
88
+
89
+ st.markdown("</div>", unsafe_allow_html=True)
90
+
91
+ st.subheader(":violet[📃 Major Simple EDA Steps]")
92
+
93
+ st.markdown("✅ **Check Text Case** – Identify if text is in **lowercase, uppercase, or mixed case**.")
94
+ st.markdown("✅ **Detect HTML & URL Tags** – Analyze if text contains unwanted elements.")
95
+ st.markdown("✅ **Identify URLs** – Ensure URLs are either preserved or removed based on problem statement.")
96
+ st.markdown("✅ **Detect Mentions & Hashtags** – Find occurrences of `@mentions` or `#hashtags`.")
97
+ st.markdown("✅ **Identify Numeric Data** – Detect if text includes **digits or numerical data**.")
98
+ st.markdown("✅ **Analyze Punctuation Usage** – Check whether punctuation marks affect text clarity.")
99
+ st.markdown("✅ **Detect Emojis** – Ensure **emoji-based sentiments** are not lost.")
100
+ st.markdown("✅ **Analyze Date/Time Formats** – Identify the presence of date/time-related text.")
101
+
102
+ st.success("🚀 Performing **Simple EDA** ensures structured and high-quality text data, leading to better NLP model performance!")
103
+
104
+
105
+ st.code('''
106
+ import pandas as pd
107
+ import numpy as np
108
+ import re
109
+ import emoji
110
+
111
+ def simple_eda(data,column):
112
+ lower_upper = data[column].apply(lambda x:True if (x.lower()) or (x.upper()) else False).sum()
113
+ tags = data[column].apply(lambda x:True if re.search("<.*?>",x) else False).sum()
114
+ urls = data[column].apply(lambda x:True if re.search("https://\S+",x) else False).sum()
115
+ mails = data[column].apply(lambda x:True if re.search("\S+@\S+",x) else False).sum()
116
+ mentions = data[column].apply(lambda x:True if re.search("\B[@#]\S+",x) else False).sum()
117
+ emojis = data[column].apply(lambda x:True if emoji.emoji_count(x) else False).sum()
118
+ digit = data[column].apply(lambda x:True if re.search("\d",x) else False).sum()
119
+ punc = data[column].apply(lambda x:True if re.search('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]',x) else False).sum()
120
+ dates = data[column].apply(lambda x:True if re.search(r"^[0-9]{1,2}\/[0-9]{1,2}\/[0-9]{4}$",x) else False).sum()
121
+
122
+ if lower_upper >0:
123
+ print("text have combination")
124
+ if tags > 0:
125
+ print("text have tags")
126
+ if urls >0:
127
+ print("text have urls")
128
+ if mails > 0:
129
+ print("text have mails")
130
+ if mentions >0:
131
+ print("text have mentions")
132
+ if emojis > 0:
133
+ print("text have emojis")
134
+ if digit >0:
135
+ print("text have digit")
136
+ if punc > 0:
137
+ print("text have punctuations")
138
+ if dates >0:
139
+ print("text have dates")
140
+
141
+ ''')
142
+
143
+ st.markdown('''
144
+ - By the following code we will check the exploration of the data
145
+ - Basically it gives the quality of collected text data
146
+ - After the simple eda we will perform pre-processing on text based on problem statement after knowing quality of the data
147
+ ''')