satya11 committed on
Commit
a539a5e
·
verified ·
1 Parent(s): e51b653

Update pages/4.Simple EDA.py

Browse files
Files changed (1) hide show
  1. pages/4.Simple EDA.py +139 -200
pages/4.Simple EDA.py CHANGED
@@ -1,262 +1,201 @@
1
  import streamlit as st
2
  import pandas as pd
3
- import re
4
- import emoji
5
- from io import StringIO
6
 
7
  st.markdown("""
8
  <style>
9
- /* Set a soft background color */
10
  body {
11
- background-color: #eef2f7;
 
12
  }
13
- /* Style for main title */
 
14
  h1 {
15
- color: black;
16
- font-family: 'Roboto', sans-serif;
17
  font-weight: 700;
18
  text-align: center;
19
  margin-bottom: 25px;
 
 
20
  }
21
- /* Style for headers */
 
22
  h2 {
23
- color: black;
24
- font-family: 'Roboto', sans-serif;
25
  font-weight: 600;
26
  margin-top: 30px;
 
 
27
  }
28
 
29
- /* Style for subheaders */
30
- h3 {
31
- color: red;
32
- font-family: 'Roboto', sans-serif;
33
  font-weight: 500;
34
  margin-top: 20px;
35
  }
36
- .custom-subheader {
37
- color: black;
38
- font-family: 'Roboto', sans-serif;
39
- font-weight: 600;
40
- margin-bottom: 15px;
41
- }
42
- /* Paragraph styling */
43
- p {
44
  font-family: 'Georgia', serif;
45
  line-height: 1.8;
46
- color: black;
47
  margin-bottom: 20px;
48
  }
49
- /* List styling with checkmark bullets */
50
- .icon-bullet {
 
51
  list-style-type: none;
52
  padding-left: 20px;
53
  }
54
- .icon-bullet li {
 
55
  font-family: 'Georgia', serif;
56
  font-size: 1.1em;
57
  margin-bottom: 10px;
58
- color: black;
 
 
59
  }
60
- .icon-bullet li::before {
61
- content: "◆";
62
- padding-right: 10px;
63
- color: black;
 
 
 
64
  }
 
65
  /* Sidebar styling */
66
  .sidebar .sidebar-content {
67
  background-color: #ffffff;
68
- border-radius: 10px;
69
  padding: 15px;
 
70
  }
71
- .sidebar h2 {
72
- color: #495057;
 
 
 
 
 
73
  }
74
- /* Custom button style */
75
- .streamlit-button {
76
- background-color: #00FFFF;
77
- color: #000000;
78
- font-weight: bold;
 
 
79
  }
80
- .eda-result {
81
- background-color: #f8f9fa;
82
- border-radius: 5px;
 
 
83
  padding: 15px;
84
- margin: 10px 0;
85
- border-left: 4px solid #6c757d;
86
  }
87
  </style>
88
  """, unsafe_allow_html=True)
89
 
90
- st.header(":red[📊 Advanced Text EDA Tool 💬]")
91
 
92
- # Introduction to Simple EDA
93
- st.markdown("<div class='section'>", unsafe_allow_html=True)
94
- st.markdown("<h2 class='title'>🔍 Comprehensive Text Analysis</h2>", unsafe_allow_html=True)
95
- st.markdown("<p class='subtitle'>Evaluate raw text data quality with detailed metrics</p>", unsafe_allow_html=True)
 
 
 
96
 
97
- st.info("""
98
- 📌 **Key Benefits of Text EDA:**
99
- - Ensures raw data quality before processing
100
- - Identifies text patterns and special characters
101
- - Helps determine necessary preprocessing steps
102
- - Not dependent on specific problem statements
103
- """)
 
 
104
 
105
- st.markdown("</div>", unsafe_allow_html=True)
 
 
 
 
 
106
 
107
- # File upload section
108
- st.subheader(":violet[📂 Upload Your Data]")
109
- uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
 
 
 
 
 
 
 
 
110
 
111
- if uploaded_file is not None:
112
- # Read the uploaded file
113
- df = pd.read_csv(uploaded_file)
114
-
115
- # Show dataframe
116
- st.subheader("📊 Data Preview")
117
- st.dataframe(df.head())
118
-
119
- # Select text column
120
- text_column = st.selectbox("Select the text column to analyze", df.columns)
121
-
122
- # Analysis parameters
123
- st.subheader("⚙️ Analysis Parameters")
124
- sample_size = st.slider("Sample size (0 for full dataset)", 0, len(df), min(500, len(df)))
125
- analyze_button = st.button("Run Text Analysis", type="primary")
126
-
127
- if analyze_button:
128
- st.subheader("📈 Analysis Results")
129
-
130
- # Get sample if requested
131
- if sample_size > 0:
132
- df_sample = df.sample(min(sample_size, len(df)))
133
- else:
134
- df_sample = df.copy()
135
-
136
- # Define analysis functions
137
- def has_mixed_case(text):
138
- return not (text.islower() or text.isupper())
139
-
140
- def has_html_tags(text):
141
- return bool(re.search("<.*?>", str(text)))
142
-
143
- def has_urls(text):
144
- return bool(re.search("https?://\S+|www\.\S+", str(text)))
145
-
146
- def has_emails(text):
147
- return bool(re.search("\S+@\S+", str(text)))
148
-
149
- def has_mentions(text):
150
- return bool(re.search("\B[@#]\S+", str(text)))
151
-
152
- def has_emojis(text):
153
- return emoji.emoji_count(str(text)) > 0
154
-
155
- def has_digits(text):
156
- return bool(re.search("\d", str(text)))
157
-
158
- def has_punctuation(text):
159
- return bool(re.search('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]', str(text)))
160
-
161
- def has_dates(text):
162
- return bool(re.search(r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b", str(text)))
163
-
164
- # Calculate metrics
165
- results = {
166
- "Mixed Case": df_sample[text_column].apply(has_mixed_case).sum(),
167
- "HTML Tags": df_sample[text_column].apply(has_html_tags).sum(),
168
- "URLs": df_sample[text_column].apply(has_urls).sum(),
169
- "Email Addresses": df_sample[text_column].apply(has_emails).sum(),
170
- "Mentions/Hashtags": df_sample[text_column].apply(has_mentions).sum(),
171
- "Emojis": df_sample[text_column].apply(has_emojis).sum(),
172
- "Digits": df_sample[text_column].apply(has_digits).sum(),
173
- "Punctuation": df_sample[text_column].apply(has_punctuation).sum(),
174
- "Date Formats": df_sample[text_column].apply(has_dates).sum()
175
- }
176
-
177
- # Display results
178
- total_texts = len(df_sample)
179
-
180
- for feature, count in results.items():
181
- percentage = (count / total_texts) * 100
182
- st.markdown(f"""
183
- <div class="eda-result">
184
- <h4>{feature}</h4>
185
- <p><strong>{count}</strong> texts contain this feature ({percentage:.1f}% of sample)</p>
186
- </div>
187
- """, unsafe_allow_html=True)
188
-
189
- # Show sample examples
190
- st.subheader("🔍 Sample Examples")
191
-
192
- for feature, count in results.items():
193
- if count > 0:
194
- st.write(f"**Examples with {feature}:**")
195
- examples = df_sample[df_sample[text_column].apply(locals()[f"has_{feature.lower().replace(' ', '_').replace('/', '_')}"])][text_column].head(3).tolist()
196
- for example in examples:
197
- st.code(example, language='text')
198
- st.write("")
199
 
200
- else:
201
- st.subheader(":violet[📃 Text Analysis Features]")
202
- st.markdown("""
203
- **Check Text Case** – Identify if text is in lowercase, uppercase, or mixed case
204
- **Detect HTML & URL Tags** Analyze if text contains unwanted elements
205
- ✅ **Identify URLs** – Find web links in the text
206
- **Detect Email Addresses** – Locate email patterns
207
- ✅ **Find Mentions & Hashtags** – Identify @mentions or #hashtags
208
- ✅ **Analyze Emoji Usage** – Count emoji occurrences
209
- ✅ **Identify Numeric Data** – Detect digits or numerical data
210
- ✅ **Check Punctuation** – Analyze punctuation usage
211
- ✅ **Find Date Formats** – Identify date/time patterns
212
- """)
213
-
214
- st.success("🚀 Upload a CSV file to begin your text analysis!")
215
 
216
- # Code display section
217
- st.subheader(":violet[💻 Analysis Code]")
218
  st.code('''
 
 
219
 
220
- def text_analysis(data, text_column):
221
- """Comprehensive text analysis function"""
222
  results = {}
223
 
224
- # Case analysis
225
- results['mixed_case'] = data[text_column].apply(
226
- lambda x: not (str(x).islower() or str(x).isupper())
227
- ).sum()
228
 
229
- # Special patterns
230
- patterns = {
231
- 'html_tags': r"<.*?>",
232
- 'urls': r"https?://\S+|www\.\S+",
233
- 'emails': r"\S+@\S+",
234
- 'mentions': r"\B[@#]\S+",
235
- 'digits': r"\d",
236
- 'punctuation': r'[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]',
237
- 'dates': r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b"
238
- }
239
 
240
- for name, pattern in patterns.items():
241
- results[name] = data[text_column].apply(
242
- lambda x: bool(re.search(pattern, str(x)))
243
- ).sum()
244
 
245
- # Emoji analysis
246
- results['emojis'] = data[text_column].apply(
247
- lambda x: emoji.emoji_count(str(x)) > 0
248
- ).sum()
249
 
250
- return results
 
 
 
 
 
 
 
 
 
251
  ''', language='python')
252
 
253
  st.markdown("""
254
- ### How to Use This Analysis:
255
- 1. **Upload** your CSV file containing text data
256
- 2. **Select** the text column to analyze
257
- 3. **Choose** a sample size (or use full dataset)
258
- 4. **Run** the analysis to get detailed metrics
259
- 5. **Review** the results to determine necessary preprocessing steps
260
- """)
261
-
262
-
 
1
  import streamlit as st
2
  import pandas as pd
 
 
 
3
 
4
  st.markdown("""
5
  <style>
6
+ /* Main background and font settings */
7
  body {
8
+ background-color: #f8f9fa;
9
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
10
  }
11
+
12
+ /* Main title styling */
13
  h1 {
14
+ color: #2c3e50;
15
+ font-family: 'Arial', sans-serif;
16
  font-weight: 700;
17
  text-align: center;
18
  margin-bottom: 25px;
19
+ border-bottom: 2px solid #3498db;
20
+ padding-bottom: 10px;
21
  }
22
+
23
+ /* Header styling */
24
  h2 {
25
+ color: #2c3e50;
26
+ font-family: 'Arial', sans-serif;
27
  font-weight: 600;
28
  margin-top: 30px;
29
+ border-left: 4px solid #3498db;
30
+ padding-left: 10px;
31
  }
32
 
33
+ /* Subheader styling */
34
+ h3 {
35
+ color: #2c3e50;
36
+ font-family: 'Arial', sans-serif;
37
  font-weight: 500;
38
  margin-top: 20px;
39
  }
40
+
41
+ /* Custom text styling */
42
+ .custom-text {
 
 
 
 
 
43
  font-family: 'Georgia', serif;
44
  line-height: 1.8;
45
+ color: #34495e;
46
  margin-bottom: 20px;
47
  }
48
+
49
+ /* List styling */
50
+ .custom-list {
51
  list-style-type: none;
52
  padding-left: 20px;
53
  }
54
+
55
+ .custom-list li {
56
  font-family: 'Georgia', serif;
57
  font-size: 1.1em;
58
  margin-bottom: 10px;
59
+ color: #34495e;
60
+ position: relative;
61
+ padding-left: 25px;
62
  }
63
+
64
+ .custom-list li::before {
65
+ content: "•";
66
+ color: #3498db;
67
+ font-weight: bold;
68
+ position: absolute;
69
+ left: 0;
70
  }
71
+
72
  /* Sidebar styling */
73
  .sidebar .sidebar-content {
74
  background-color: #ffffff;
75
+ border-radius: 8px;
76
  padding: 15px;
77
+ box-shadow: 0 2px 5px rgba(0,0,0,0.1);
78
  }
79
+
80
+ /* Info box styling */
81
+ .stInfo {
82
+ background-color: #e8f4fc;
83
+ border-left: 4px solid #3498db;
84
+ padding: 15px;
85
+ border-radius: 0 4px 4px 0;
86
  }
87
+
88
+ /* Success box styling */
89
+ .stSuccess {
90
+ background-color: #e8f8f0;
91
+ border-left: 4px solid #2ecc71;
92
+ padding: 15px;
93
+ border-radius: 0 4px 4px 0;
94
  }
95
+
96
+ /* Code block styling */
97
+ .stCodeBlock {
98
+ background-color: #f5f5f5;
99
+ border-radius: 4px;
100
  padding: 15px;
101
+ border-left: 4px solid #95a5a6;
 
102
  }
103
  </style>
104
  """, unsafe_allow_html=True)
105
 
106
+ st.title("Text Data Quality Analysis")
107
 
108
+ # Introduction section
109
+ st.markdown("""
110
+ <div class='custom-text'>
111
+ <h2>Understanding Text Data Quality Analysis</h2>
112
+ <p>Evaluating raw text data quality before processing is a critical first step in any text analysis project.</p>
113
+ </div>
114
+ """, unsafe_allow_html=True)
115
 
116
+ st.markdown("""
117
+ <div class='stInfo'>
118
+ <strong>Text Data Quality Analysis is crucial because:</strong><br><br>
119
+ • Ensures raw data quality before processing<br>
120
+ • Helps identify potential issues early in the pipeline<br>
121
+ • Provides insights for better data exploration<br>
122
+ • Is independent of the specific problem statement
123
+ </div>
124
+ """, unsafe_allow_html=True)
125
 
126
+ # Main analysis steps
127
+ st.markdown("""
128
+ <div class='custom-text'>
129
+ <h2>Key Text Data Quality Checks</h2>
130
+ </div>
131
+ """, unsafe_allow_html=True)
132
 
133
+ st.markdown("""
134
+ <ul class='custom-list'>
135
+ <li><strong>Check Text Case</strong> – Identify if text is in lowercase, uppercase, or mixed case</li>
136
+ <li><strong>Detect HTML Tags</strong> – Analyze if text contains unwanted HTML elements</li>
137
+ <li><strong>Identify URLs</strong> – Check for web addresses that may need processing</li>
138
+ <li><strong>Detect Mentions & Hashtags</strong> – Find occurrences of @mentions or #hashtags</li>
139
+ <li><strong>Identify Numeric Data</strong> – Detect if text includes digits or numerical data</li>
140
+ <li><strong>Analyze Punctuation Usage</strong> – Check whether punctuation marks affect text clarity</li>
141
+ <li><strong>Analyze Date/Time Formats</strong> – Identify the presence of date/time-related text</li>
142
+ </ul>
143
+ """, unsafe_allow_html=True)
144
 
145
+ st.markdown("""
146
+ <div class='stSuccess'>
147
+ Performing thorough text data quality analysis ensures structured and high-quality text data, leading to better analysis and model performance.
148
+ </div>
149
+ """, unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
 
151
+ # Code example
152
+ st.markdown("""
153
+ <div class='custom-text'>
154
+ <h2>Implementation Example</h2>
155
+ <p>Here's a Python function to perform basic text data quality checks:</p>
156
+ </div>
157
+ """, unsafe_allow_html=True)
 
 
 
 
 
 
 
 
158
 
 
 
159
  st.code('''
160
+ import pandas as pd
161
+ import re
162
 
163
+ def text_quality_analysis(data, column):
164
+ # Initialize results dictionary
165
  results = {}
166
 
167
+ # Check for case variations
168
+ results['has_lowercase'] = data[column].str.contains('[a-z]').sum()
169
+ results['has_uppercase'] = data[column].str.contains('[A-Z]').sum()
 
170
 
171
+ # Check for HTML tags
172
+ results['has_html_tags'] = data[column].str.contains('<.*?>', regex=True).sum()
173
+
174
+ # Check for URLs
175
+ results['has_urls'] = data[column].str.contains('https?://\\S+', regex=True).sum()
 
 
 
 
 
176
 
177
+ # Check for email addresses
178
+ results['has_emails'] = data[column].str.contains('\\S+@\\S+', regex=True).sum()
 
 
179
 
180
+ # Check for mentions and hashtags
181
+ results['has_mentions'] = data[column].str.contains('@\\w+', regex=True).sum()
182
+ results['has_hashtags'] = data[column].str.contains('#\\w+', regex=True).sum()
 
183
 
184
+ # Check for digits
185
+ results['has_digits'] = data[column].str.contains('\\d', regex=True).sum()
186
+
187
+ # Check for punctuation
188
+ results['has_punctuation'] = data[column].str.contains('[!"#$%&\\\'()*+,-./:;<=>?@[\\\\]^_`{|}~]', regex=True).sum()
189
+
190
+ # Check for date formats (simple check)
191
+ results['has_dates'] = data[column].str.contains('\\d{1,2}/\\d{1,2}/\\d{2,4}', regex=True).sum()
192
+
193
+ return pd.DataFrame.from_dict(results, orient='index', columns=['Count'])
194
  ''', language='python')
195
 
196
  st.markdown("""
197
+ <div class='custom-text'>
198
+ <p>This function provides a comprehensive analysis of text data quality by checking for various common elements that might need special handling during preprocessing.</p>
199
+ <p>The results can help guide your data cleaning strategy based on the specific characteristics of your text data.</p>
200
+ </div>
201
+ """, unsafe_allow_html=True)