satya11 committed on
Commit
66617ac
·
verified ·
1 Parent(s): a6e7558

Create 4.Simple_EDA.py

Browse files
Files changed (1) hide show
  1. pages/4.Simple_EDA.py +263 -0
pages/4.Simple_EDA.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import re
4
+ import emoji
5
+ from io import StringIO
6
+
7
# Inject the page-wide stylesheet. The CSS is passed as raw HTML, which is why
# unsafe_allow_html=True is required. The bullet glyph in
# `.icon-bullet li::before` is written as the real "◆" character; the previous
# text was a mis-decoded byte sequence.
st.markdown("""
<style>
/* Set a soft background color */
body {
    background-color: #eef2f7;
}
/* Style for main title */
h1 {
    color: black;
    font-family: 'Roboto', sans-serif;
    font-weight: 700;
    text-align: center;
    margin-bottom: 25px;
}
/* Style for headers */
h2 {
    color: black;
    font-family: 'Roboto', sans-serif;
    font-weight: 600;
    margin-top: 30px;
}

/* Style for subheaders */
h3 {
    color: red;
    font-family: 'Roboto', sans-serif;
    font-weight: 500;
    margin-top: 20px;
}
.custom-subheader {
    color: black;
    font-family: 'Roboto', sans-serif;
    font-weight: 600;
    margin-bottom: 15px;
}
/* Paragraph styling */
p {
    font-family: 'Georgia', serif;
    line-height: 1.8;
    color: black;
    margin-bottom: 20px;
}
/* List styling with checkmark bullets */
.icon-bullet {
    list-style-type: none;
    padding-left: 20px;
}
.icon-bullet li {
    font-family: 'Georgia', serif;
    font-size: 1.1em;
    margin-bottom: 10px;
    color: black;
}
.icon-bullet li::before {
    content: "◆";
    padding-right: 10px;
    color: black;
}
/* Sidebar styling */
.sidebar .sidebar-content {
    background-color: #ffffff;
    border-radius: 10px;
    padding: 15px;
}
.sidebar h2 {
    color: #495057;
}
/* Custom button style */
.streamlit-button {
    background-color: #00FFFF;
    color: #000000;
    font-weight: bold;
}
.eda-result {
    background-color: #f8f9fa;
    border-radius: 5px;
    padding: 15px;
    margin: 10px 0;
    border-left: 4px solid #6c757d;
}
</style>
""", unsafe_allow_html=True)
89
+
90
# Page title. The emoji are written as real Unicode characters; the previous
# text was mis-decoded UTF-8.
st.header(":red[📊 Advanced Text EDA Tool 💬]")

# Introduction to Simple EDA
# NOTE(review): the opening and closing <div> are emitted by separate
# st.markdown calls; confirm they actually wrap the content in between.
st.markdown("<div class='section'>", unsafe_allow_html=True)
st.markdown("<h2 class='title'>🔍 Comprehensive Text Analysis</h2>", unsafe_allow_html=True)
st.markdown("<p class='subtitle'>Evaluate raw text data quality with detailed metrics</p>", unsafe_allow_html=True)

# Short summary of why the checks on this page are useful.
st.info("""
📌 **Key Benefits of Text EDA:**
- Ensures raw data quality before processing
- Identifies text patterns and special characters
- Helps determine necessary preprocessing steps
- Not dependent on specific problem statements
""")

st.markdown("</div>", unsafe_allow_html=True)
106
+
107
# File upload section
# Widget labels use real emoji characters; the previous text was mis-decoded UTF-8.
st.subheader(":violet[📂 Upload Your Data]")
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

if uploaded_file is not None:
    # Read the uploaded file into a DataFrame.
    df = pd.read_csv(uploaded_file)

    # Show the first rows so the user can check the upload.
    st.subheader("📊 Data Preview")
    st.dataframe(df.head())

    # Select the column whose values will be analysed as text.
    text_column = st.selectbox("Select the text column to analyze", df.columns)

    # Analysis parameters: the slider ranges over 0..len(df) and defaults to
    # 500 rows (or every row when the file is smaller).
    st.subheader("⚙️ Analysis Parameters")
    sample_size = st.slider("Sample size (0 for full dataset)", 0, len(df), min(500, len(df)))
    analyze_button = st.button("Run Text Analysis", type="primary")

    if analyze_button:
        st.subheader("📈 Analysis Results")

        # A sample size of 0 means "use the full dataset"; otherwise draw a
        # random sample that never exceeds the number of rows.
        if sample_size > 0:
            df_sample = df.sample(min(sample_size, len(df)))
        else:
            df_sample = df.copy()
135
+
136
+ # Define analysis functions
137
def has_mixed_case(text):
    """Return True when *text* contains both lowercase and uppercase letters.

    The value is converted with ``str`` first, like the other detectors, so
    non-string cells (for example NaN or numbers) no longer raise
    ``AttributeError``. Values without any cased character, such as ``"123"``
    or ``""``, are not reported as mixed case.
    """
    value = str(text)
    has_lower = any(ch.islower() for ch in value)
    has_upper = any(ch.isupper() for ch in value)
    return has_lower and has_upper
139
+
140
def has_html_tags(text):
    """Return True when the string form of *text* contains an HTML-like tag.

    A tag must start with ``<`` (optionally ``</``) followed by a letter, so
    plain comparisons such as ``"3 < 5 and 7 > 2"`` or ``"<>"`` are not
    flagged. ``[^<>]*`` also matches newlines, so a tag whose attributes are
    wrapped over several lines is still found. Comment and declaration openers
    (``<!--``, ``<!DOCTYPE``) are detected as well.
    """
    tag_pattern = r"</?[A-Za-z][^<>]*>|<!(?:--|[A-Za-z])"
    return bool(re.search(tag_pattern, str(text)))
142
+
143
def has_urls(text):
    """Return True when the string form of *text* contains a URL.

    A URL is anything starting with ``http://``, ``https://`` or ``www.``
    followed by non-whitespace characters. The pattern is a raw string so that
    ``\\S`` and ``\\.`` reach the regex engine without relying on invalid
    string escapes.
    """
    return bool(re.search(r"https?://\S+|www\.\S+", str(text)))
145
+
146
def has_emails(text):
    """Return True when the string form of *text* contains an email-like token.

    The check is deliberately loose: any run of non-whitespace characters on
    both sides of an ``@``. The pattern is a raw string so ``\\S`` is not
    treated as an (invalid) string escape.
    """
    return bool(re.search(r"\S+@\S+", str(text)))
148
+
149
def has_mentions(text):
    """Return True when the string form of *text* contains an @mention or #hashtag.

    ``\\B`` requires that the ``@``/``#`` is not directly preceded by a word
    character, so the ``@`` inside ``user@example.com`` does not count. The
    pattern is a raw string so ``\\B`` and ``\\S`` are not treated as (invalid)
    string escapes.
    """
    return bool(re.search(r"\B[@#]\S+", str(text)))
151
+
152
def has_emojis(text):
    """Return True when the string form of *text* contains at least one emoji.

    Counting is delegated to ``emoji.emoji_count``; the value is converted
    with ``str`` before it is passed in.
    """
    emoji_total = emoji.emoji_count(str(text))
    return emoji_total > 0
154
+
155
def has_digits(text):
    """Return True when the string form of *text* contains at least one digit.

    The pattern is a raw string so ``\\d`` is not treated as an (invalid)
    string escape.
    """
    return bool(re.search(r"\d", str(text)))
157
+
158
def has_punctuation(text):
    """Return True when the string form of *text* contains ASCII punctuation.

    The character class lists the ASCII punctuation characters explicitly.
    ``-``, ``[``, ``\\`` and ``]`` are escaped so each is matched literally;
    previously the backslash was consumed as the escape for ``]`` and was
    therefore never detected.
    """
    punctuation_pattern = r'[!"#$%&\'()*+,\-./:;<=>?@\[\\\]^_`{|}~]'
    return bool(re.search(punctuation_pattern, str(text)))
160
+
161
def has_dates(text):
    """Return True when the string form of *text* contains a numeric date.

    Two shapes are recognised, each with ``/`` or ``-`` separators and word
    boundaries on both sides: day/month/year-like ``D/M/YY`` to ``DD/MM/YYYY``,
    and year-first ``YYYY/M/D``.
    """
    date_pattern = r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b"
    found = re.search(date_pattern, str(text))
    return found is not None
163
+
164
# Calculate metrics
# Map every report label to its detector explicitly. Deriving the function
# name from the label does not work for all entries (for example
# "Email Addresses" would become has_email_addresses, which does not exist),
# so the name-based lookup raised KeyError when showing examples.
detectors = {
    "Mixed Case": has_mixed_case,
    "HTML Tags": has_html_tags,
    "URLs": has_urls,
    "Email Addresses": has_emails,
    "Mentions/Hashtags": has_mentions,
    "Emojis": has_emojis,
    "Digits": has_digits,
    "Punctuation": has_punctuation,
    "Date Formats": has_dates,
}

# Run each detector once and keep the boolean mask, so the counts and the
# example rows below come from the same pass over the data.
text_values = df_sample[text_column]
feature_masks = {
    label: text_values.apply(detector).astype(bool)
    for label, detector in detectors.items()
}
results = {label: int(mask.sum()) for label, mask in feature_masks.items()}

# Display results
total_texts = len(df_sample)

for feature, count in results.items():
    # Guard against an empty sample so the percentage never divides by zero.
    percentage = (count / total_texts) * 100 if total_texts else 0.0
    st.markdown(f"""
<div class="eda-result">
<h4>{feature}</h4>
<p><strong>{count}</strong> texts contain this feature ({percentage:.1f}% of sample)</p>
</div>
""", unsafe_allow_html=True)

# Show sample examples: up to three matching rows per detected feature.
st.subheader("🔍 Sample Examples")

for feature, count in results.items():
    if count > 0:
        st.write(f"**Examples with {feature}:**")
        examples = text_values[feature_masks[feature]].head(3).tolist()
        for example in examples:
            st.code(example, language='text')
        st.write("")
199
+
200
+ else:
201
+ st.subheader(":violet[πŸ“ƒ Text Analysis Features]")
202
+ st.markdown("""
203
+ βœ… **Check Text Case** – Identify if text is in lowercase, uppercase, or mixed case
204
+ βœ… **Detect HTML & URL Tags** – Analyze if text contains unwanted elements
205
+ βœ… **Identify URLs** – Find web links in the text
206
+ βœ… **Detect Email Addresses** – Locate email patterns
207
+ βœ… **Find Mentions & Hashtags** – Identify @mentions or #hashtags
208
+ βœ… **Analyze Emoji Usage** – Count emoji occurrences
209
+ βœ… **Identify Numeric Data** – Detect digits or numerical data
210
+ βœ… **Check Punctuation** – Analyze punctuation usage
211
+ βœ… **Find Date Formats** – Identify date/time patterns
212
+ """)
213
+
214
+ st.success("πŸš€ Upload a CSV file to begin your text analysis!")
215
+
216
# Code display section
# The snippet is a *raw* triple-quoted string so the regex escapes are shown
# exactly as written. In a normal string literal "\b" turns into a backspace
# character and "\'" collapses to "'", which made the displayed code invalid
# when copied. The snippet is also indented as real Python.
st.subheader(":violet[💻 Analysis Code]")
st.code(r'''
import pandas as pd
import re
import emoji

def text_analysis(data, text_column):
    """Comprehensive text analysis function"""
    results = {}

    # Case analysis
    results['mixed_case'] = data[text_column].apply(
        lambda x: not (str(x).islower() or str(x).isupper())
    ).sum()

    # Special patterns
    patterns = {
        'html_tags': r"<.*?>",
        'urls': r"https?://\S+|www\.\S+",
        'emails': r"\S+@\S+",
        'mentions': r"\B[@#]\S+",
        'digits': r"\d",
        'punctuation': r'[!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~]',
        'dates': r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b"
    }

    for name, pattern in patterns.items():
        results[name] = data[text_column].apply(
            lambda x: bool(re.search(pattern, str(x)))
        ).sum()

    # Emoji analysis
    results['emojis'] = data[text_column].apply(
        lambda x: emoji.emoji_count(str(x)) > 0
    ).sum()

    return results
''', language='python')
255
+
256
# Closing section: numbered walkthrough of the workflow on this page.
usage_guide = """
### How to Use This Analysis:
1. **Upload** your CSV file containing text data
2. **Select** the text column to analyze
3. **Choose** a sample size (or use full dataset)
4. **Run** the analysis to get detailed metrics
5. **Review** the results to determine necessary preprocessing steps
"""
st.markdown(usage_guide)