entropy25 committed on
Commit
78b8458
·
verified ·
1 Parent(s): 6a3c971

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +92 -317
app.py CHANGED
@@ -1,44 +1,17 @@
1
  import streamlit as st
2
  import pandas as pd
3
- import os
4
  from data_handler import load_data
5
  from analyzer import DataAnalysisWorkflow, AIAssistant
6
 
7
- # HuggingFace specific configurations
8
- def configure_for_hf():
9
- """Configure app for HuggingFace Spaces deployment"""
10
  st.set_page_config(
11
- page_title="AI Data Quality Inspector",
12
- page_icon="๐Ÿ”",
13
- layout="wide",
14
- initial_sidebar_state="expanded"
15
  )
16
 
17
- # Add custom CSS for better mobile experience
18
- st.markdown("""
19
- <style>
20
- .main > div {
21
- padding-top: 2rem;
22
- }
23
- .stMetric {
24
- background-color: #f0f2f6;
25
- padding: 1rem;
26
- border-radius: 0.5rem;
27
- border: 1px solid #e6e9ef;
28
- }
29
- </style>
30
- """, unsafe_allow_html=True)
31
-
32
- def main():
33
- configure_for_hf()
34
-
35
- # Header with improved styling
36
- st.title("๐Ÿ” AI Data Quality Inspector")
37
- st.markdown("""
38
- **Upload โ†’ Inspect โ†’ Repair โ†’ Download** | *Transform messy data into clean datasets in 3 minutes*
39
-
40
- ๐ŸŽฏ **Features**: Visual quality scoring โ€ข AI repair suggestions โ€ข Interactive problem detection โ€ข One-click fixes
41
- """)
42
 
43
  # Initialize session state
44
  if 'current_stage' not in st.session_state:
@@ -48,309 +21,111 @@ def main():
48
  if 'ai_assistant' not in st.session_state:
49
  st.session_state.ai_assistant = AIAssistant()
50
 
51
- # File upload with enhanced UI
52
- st.markdown("### ๐Ÿ“ Upload Dataset")
53
- uploaded_file = st.file_uploader(
54
- "Choose CSV or Excel file",
55
- type=['csv', 'xlsx', 'xls'],
56
- help="Supports files up to 200MB. CSV files with UTF-8, Latin-1, or CP1252 encoding."
57
- )
58
-
59
- # Sample data option for demo
60
- col1, col2 = st.columns([3, 1])
61
- with col2:
62
- if st.button("๐ŸŽฎ Try Sample Data", use_container_width=True):
63
- # Create sample problematic dataset
64
- import numpy as np
65
- np.random.seed(42)
66
-
67
- sample_data = {
68
- 'customer_id': range(1, 1001),
69
- 'age': np.random.normal(35, 10, 1000),
70
- 'income': np.random.normal(50000, 15000, 1000),
71
- 'score': np.random.normal(75, 20, 1000),
72
- 'category': np.random.choice(['Premium', 'Standard', 'Basic', None], 1000, p=[0.3, 0.4, 0.2, 0.1]),
73
- 'region': np.random.choice(['North', 'South', 'East', 'West'], 1000)
74
- }
75
-
76
- df = pd.DataFrame(sample_data)
77
-
78
- # Inject quality issues for demonstration
79
- missing_indices = np.random.choice(df.index, 150, replace=False)
80
- df.loc[missing_indices, 'income'] = None
81
-
82
- outlier_indices = np.random.choice(df.index, 50, replace=False)
83
- df.loc[outlier_indices, 'age'] = np.random.uniform(100, 150, 50)
84
-
85
- df = pd.concat([df, df.head(25)]) # Add duplicates
86
-
87
- st.session_state.sample_data = df
88
- st.success("โœ… Sample data loaded! Continue below to analyze.")
89
 
90
- # Handle file upload or sample data
91
- df = None
92
  if uploaded_file is not None:
93
  try:
94
- with st.spinner("๐Ÿ”„ Loading and analyzing dataset..."):
95
- df = load_data(uploaded_file)
96
- st.success(f"โœ… Dataset loaded successfully! Shape: {df.shape[0]:,} rows ร— {df.shape[1]} columns")
97
- except Exception as e:
98
- st.error(f"โŒ Error loading file: {str(e)}")
99
- st.info("Please check your file format. Supported: CSV (UTF-8, Latin-1), Excel (.xlsx, .xls)")
100
- return
101
- elif 'sample_data' in st.session_state:
102
- df = st.session_state.sample_data
103
- st.info("๐Ÿ“Š Using sample dataset for demonstration")
104
-
105
- if df is not None:
106
- # Initialize or update workflow
107
- if (st.session_state.workflow is None or
108
- st.session_state.workflow.df.shape != df.shape):
109
- st.session_state.workflow = DataAnalysisWorkflow(df)
110
-
111
- workflow = st.session_state.workflow
112
-
113
- # Enhanced sidebar with progress
114
- with st.sidebar:
115
- st.header("๐ŸŽฏ Analysis Progress")
116
 
117
- # Progress indicator - Fixed to prevent values > 1.0
118
- progress_value = min(st.session_state.current_stage / 5, 1.0)
119
- st.progress(progress_value)
120
- st.write(f"Stage {st.session_state.current_stage} of 5")
121
 
122
- # Stage navigation
123
- stages = [
124
- ("๐Ÿ“Š", "Data Overview", "Get instant quality insights"),
125
- ("๐Ÿ”", "Exploration", "Discover patterns and relationships"),
126
- ("๐Ÿงน", "Quality Check", "Detect and fix data issues"),
127
- ("๐Ÿ”ฌ", "Analysis", "Advanced statistical analysis"),
128
- ("๐Ÿ“ˆ", "Summary", "Export results and reports")
129
- ]
130
 
131
- st.markdown("### ๐Ÿ“‹ Stages")
132
- for i, (icon, name, desc) in enumerate(stages, 1):
133
  if i == st.session_state.current_stage:
134
- st.markdown(f"**{icon} {i}. {name}** ๐Ÿ”„")
135
- st.caption(desc)
136
  elif i < st.session_state.current_stage:
137
- st.markdown(f"โœ… {icon} {i}. {name}")
138
  else:
139
- st.markdown(f"โณ {icon} {i}. {name}")
140
 
141
- # Navigation buttons
142
- col1, col2 = st.columns(2)
143
  with col1:
144
- if st.button("โ† Previous", disabled=st.session_state.current_stage <= 1):
145
  st.session_state.current_stage -= 1
146
  st.rerun()
147
  with col2:
148
- if st.button("Next โ†’", disabled=st.session_state.current_stage >= 5):
149
  st.session_state.current_stage += 1
150
  st.rerun()
151
 
152
- # Quick insights panel
153
- if workflow.insights:
154
- st.markdown("### ๐Ÿ’ก Latest Insights")
155
- recent_insights = workflow.insights[-3:]
156
- for insight in recent_insights:
157
- with st.expander(f"Stage {insight['stage']}", expanded=False):
158
- st.write(insight['insight'])
159
-
160
- # Quick stats
161
- st.markdown("### ๐Ÿ“Š Quick Stats")
162
- st.metric("Data Quality", "Calculating..." if not workflow.insights else "Good")
163
- st.metric("Issues Found", len([i for i in workflow.insights if 'issue' in i['insight'].lower()]))
164
- st.metric("Memory Usage", f"{workflow.stats['memory_usage']:.1f} MB")
165
-
166
- # Main content area with enhanced layout
167
- st.markdown("---")
168
-
169
- # Stage execution with improved styling
170
- if st.session_state.current_stage == 1:
171
- st.markdown("## ๐Ÿ“Š Data Overview & Quality Assessment")
172
- workflow.stage_1_overview()
173
-
174
- elif st.session_state.current_stage == 2:
175
- st.markdown("## ๐Ÿ” Exploratory Data Analysis")
176
- workflow.stage_2_exploration()
177
-
178
- elif st.session_state.current_stage == 3:
179
- st.markdown("## ๐Ÿงน Data Quality Check & Repair")
180
- workflow.stage_3_cleaning()
181
-
182
- elif st.session_state.current_stage == 4:
183
- st.markdown("## ๐Ÿ”ฌ Advanced Statistical Analysis")
184
- workflow.stage_4_analysis()
185
-
186
- elif st.session_state.current_stage == 5:
187
- st.markdown("## ๐Ÿ“ˆ Summary & Export")
188
- workflow.stage_5_summary()
189
-
190
- # AI Assistant panel (enhanced for HF)
191
- with st.expander("๐Ÿค– AI Assistant", expanded=False):
192
- st.markdown("### AI-Powered Data Insights")
193
-
194
- # Show available AI features (mock for HF deployment)
195
- st.info("๐Ÿ’ก **AI Features Available:**\n- Automated quality scoring\n- Smart repair suggestions\n- Business impact analysis\n- Pattern recognition")
196
-
197
- if st.button("๐Ÿง  Generate AI Analysis"):
198
- if workflow.insights:
199
- with st.spinner("๐Ÿค– AI analyzing your data..."):
200
- # Simulate AI analysis with built-in intelligence
201
- ai_insights = generate_builtin_ai_analysis(workflow.df, workflow.insights)
202
-
203
- st.markdown("**๐ŸŽฏ AI Analysis Results:**")
204
- for category, insight in ai_insights.items():
205
- with st.expander(f"๐Ÿ“‹ {category}", expanded=True):
206
- st.write(insight)
207
  else:
208
- st.warning("Complete some analysis stages first to get AI insights.")
209
-
210
- else:
211
- # Enhanced landing page
212
- st.markdown("""
213
- ## ๐Ÿš€ Welcome to AI Data Quality Inspector
214
-
215
- Transform your messy datasets into analysis-ready data in just **3 minutes**!
216
-
217
- ### โœจ What You Get:
218
- """)
219
-
220
- col1, col2, col3 = st.columns(3)
 
 
 
 
 
 
 
 
 
221
 
222
- with col1:
223
- st.markdown("""
224
- **๐Ÿ” Instant Detection**
225
- - Visual quality scoring
226
- - Missing value heatmaps
227
- - Outlier identification
228
- - Duplicate detection
229
- """)
230
-
231
- with col2:
232
- st.markdown("""
233
- **๐Ÿค– AI Guidance**
234
- - Smart repair suggestions
235
- - Business impact analysis
236
- - Confidence scoring
237
- - One-click fixes
238
- """)
239
-
240
- with col3:
241
- st.markdown("""
242
- **๐Ÿ“Š Professional Results**
243
- - Clean datasets
244
- - Quality reports
245
- - Visual comparisons
246
- - Export options
247
- """)
248
-
249
- st.markdown("""
250
- ### ๐ŸŽฏ Perfect For:
251
- - **Business Analysts**: Validate data before reporting
252
- - **Data Engineers**: Pre-import quality checks
253
- - **Operations Teams**: Non-technical data assessment
254
-
255
- ### ๐Ÿš€ Get Started:
256
- 1. Upload your CSV or Excel file above
257
- 2. Navigate through the 5-stage analysis workflow
258
- 3. Apply AI-suggested repairs with one click
259
- 4. Download your cleaned dataset and quality report
260
- """)
261
-
262
- def generate_builtin_ai_analysis(df: pd.DataFrame, insights: list) -> dict:
263
- """Generate AI-style analysis without external APIs"""
264
-
265
- analysis = {}
266
-
267
- # Data Quality Assessment
268
- missing_pct = (df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100
269
- duplicate_pct = (df.duplicated().sum() / len(df)) * 100
270
-
271
- if missing_pct > 10:
272
- analysis["๐Ÿšจ Data Completeness"] = f"""
273
- **Issue**: {missing_pct:.1f}% of your data is missing, which could significantly impact analysis reliability.
274
-
275
- **Business Impact**: Missing data can lead to biased insights and incorrect business decisions.
276
-
277
- **Recommendation**: Focus on columns with >20% missing values - consider external data sources or business process improvements.
278
- """
279
- elif missing_pct > 0:
280
- analysis["โœ… Data Completeness"] = f"""
281
- **Status**: Only {missing_pct:.1f}% missing data - within acceptable limits.
282
-
283
- **Recommendation**: Apply median/mode filling for remaining gaps before analysis.
284
- """
285
-
286
- # Outlier Analysis
287
- numeric_cols = df.select_dtypes(include=['number']).columns
288
- total_outliers = 0
289
-
290
- for col in numeric_cols:
291
- Q1 = df[col].quantile(0.25)
292
- Q3 = df[col].quantile(0.75)
293
- IQR = Q3 - Q1
294
- outliers = df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)]
295
- total_outliers += len(outliers)
296
-
297
- if total_outliers > len(df) * 0.05: # More than 5% outliers
298
- analysis["โš ๏ธ Statistical Anomalies"] = f"""
299
- **Issue**: {total_outliers} outliers detected ({(total_outliers/len(df)*100):.1f}% of data).
300
-
301
- **Possible Causes**: Data entry errors, system glitches, or genuine extreme values.
302
-
303
- **Recommendation**: Investigate business context before removing. Consider capping instead of deletion.
304
- """
305
-
306
- # Data Distribution Analysis
307
- if len(numeric_cols) > 1:
308
- corr_matrix = df[numeric_cols].corr()
309
- max_corr = corr_matrix.abs().unstack().sort_values(ascending=False).iloc[1] # Exclude self-correlation
310
-
311
- if max_corr > 0.8:
312
- analysis["๐Ÿ”— Strong Correlations"] = f"""
313
- **Finding**: Strong correlation detected (r={max_corr:.3f}) between variables.
314
-
315
- **Opportunity**: This suggests predictable relationships in your data - valuable for forecasting and modeling.
316
-
317
- **Next Steps**: Use correlated variables for predictive analysis or data validation.
318
- """
319
-
320
- # Memory and Performance
321
- memory_mb = df.memory_usage(deep=True).sum() / 1024**2
322
- if memory_mb > 100:
323
- analysis["๐Ÿ”ง Performance Optimization"] = f"""
324
- **Status**: Dataset uses {memory_mb:.1f}MB memory.
325
-
326
- **Optimization**: Convert categorical columns to 'category' dtype to reduce memory usage by up to 50%.
327
-
328
- **Benefit**: Faster processing and lower resource consumption.
329
- """
330
-
331
- # Business Insights
332
- categorical_cols = df.select_dtypes(include=['object']).columns
333
- if len(categorical_cols) > 0:
334
- high_cardinality_cols = [col for col in categorical_cols if df[col].nunique() > len(df) * 0.5]
335
- if high_cardinality_cols:
336
- analysis["๐Ÿ“ˆ Business Intelligence"] = f"""
337
- **Finding**: High-cardinality columns detected: {', '.join(high_cardinality_cols)}.
338
-
339
- **Insight**: These might be customer IDs or transaction codes - valuable for tracking but not for grouping analysis.
340
-
341
- **Strategy**: Use for joins and lookups, avoid in statistical summaries.
342
- """
343
-
344
- if not analysis:
345
- analysis["๐ŸŽ‰ Excellent Data Quality"] = """
346
- **Status**: Your dataset shows excellent quality metrics across all dimensions.
347
-
348
- **Ready for**: Advanced analytics, machine learning, and business intelligence applications.
349
-
350
- **Next Steps**: Proceed with confidence to your analytical objectives.
351
- """
352
-
353
- return analysis
354
 
355
  if __name__ == "__main__":
356
  main()
 
1
  import streamlit as st
2
  import pandas as pd
 
3
  from data_handler import load_data
4
  from analyzer import DataAnalysisWorkflow, AIAssistant
5
 
6
+ def main():
 
 
7
  st.set_page_config(
8
+ page_title="Data Analysis Platform",
9
+ page_icon="๐Ÿ“Š",
10
+ layout="wide"
 
11
  )
12
 
13
+ st.title("๐Ÿ“Š Data Analysis Platform")
14
+ st.markdown("**Optimized workflow with caching and pagination**")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  # Initialize session state
17
  if 'current_stage' not in st.session_state:
 
21
  if 'ai_assistant' not in st.session_state:
22
  st.session_state.ai_assistant = AIAssistant()
23
 
24
+ # File upload
25
+ uploaded_file = st.file_uploader("Upload Dataset", type=['csv', 'xlsx'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
 
 
27
  if uploaded_file is not None:
28
  try:
29
+ # Load data
30
+ df = load_data(uploaded_file)
31
+ st.success(f"โœ… Dataset loaded! Shape: {df.shape}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
+ # Initialize workflow
34
+ if st.session_state.workflow is None:
35
+ st.session_state.workflow = DataAnalysisWorkflow(df)
 
36
 
37
+ # Progress sidebar
38
+ st.sidebar.header("Progress")
39
+ progress = st.sidebar.progress(st.session_state.current_stage / 5)
 
 
 
 
 
40
 
41
+ stages = ["Data Overview", "Exploration", "Quality Check", "Analysis", "Summary"]
42
+ for i, stage in enumerate(stages, 1):
43
  if i == st.session_state.current_stage:
44
+ st.sidebar.write(f"๐Ÿ”„ **{i}. {stage}**")
 
45
  elif i < st.session_state.current_stage:
46
+ st.sidebar.write(f"โœ… {i}. {stage}")
47
  else:
48
+ st.sidebar.write(f"โณ {i}. {stage}")
49
 
50
+ # Navigation
51
+ col1, col2 = st.sidebar.columns(2)
52
  with col1:
53
+ if st.button("โ† Previous") and st.session_state.current_stage > 1:
54
  st.session_state.current_stage -= 1
55
  st.rerun()
56
  with col2:
57
+ if st.button("Next โ†’") and st.session_state.current_stage < 5:
58
  st.session_state.current_stage += 1
59
  st.rerun()
60
 
61
+ # Recent insights
62
+ st.sidebar.header("๐Ÿ’ก Recent Insights")
63
+ recent_insights = st.session_state.workflow.insights[-3:]
64
+ for insight in recent_insights:
65
+ st.sidebar.info(f"**Stage {insight['stage']}:** {insight['insight']}")
66
+
67
+ # Main content with AI assistant
68
+ main_col, ai_col = st.columns([3, 1])
69
+
70
+ with main_col:
71
+ # Execute current stage
72
+ if st.session_state.current_stage == 1:
73
+ st.session_state.workflow.stage_1_overview()
74
+ elif st.session_state.current_stage == 2:
75
+ st.session_state.workflow.stage_2_exploration()
76
+ elif st.session_state.current_stage == 3:
77
+ st.session_state.workflow.stage_3_cleaning()
78
+ elif st.session_state.current_stage == 4:
79
+ st.session_state.workflow.stage_4_analysis()
80
+ elif st.session_state.current_stage == 5:
81
+ st.session_state.workflow.stage_5_summary()
82
+
83
+ with ai_col:
84
+ st.subheader("๐Ÿค– AI Assistant")
85
+
86
+ # AI model selection
87
+ available_models = st.session_state.ai_assistant.get_available_models()
88
+
89
+ if available_models:
90
+ selected_model = st.selectbox("AI Model:", available_models)
91
+
92
+ if st.button("Get AI Insights"):
93
+ if st.session_state.workflow.insights:
94
+ with st.spinner("Analyzing with AI..."):
95
+ ai_analysis = st.session_state.ai_assistant.analyze_insights(
96
+ df, st.session_state.workflow.insights, selected_model
97
+ )
98
+ st.write("**AI Analysis:**")
99
+ st.write(ai_analysis)
100
+ else:
101
+ st.warning("Complete some analysis stages first.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  else:
103
+ st.warning("No AI models available.")
104
+ st.info("Set GOOGLE_API_KEY or OPENAI_API_KEY environment variables.")
105
+
106
+ # Quick insights
107
+ st.subheader("๐Ÿ“Š Quick Stats")
108
+ if st.session_state.workflow.insights:
109
+ st.metric("Total Insights", len(st.session_state.workflow.insights))
110
+ st.metric("Current Stage", f"{st.session_state.current_stage}/5")
111
+
112
+ # Latest insight
113
+ if st.session_state.workflow.insights:
114
+ latest = st.session_state.workflow.insights[-1]
115
+ st.info(f"**Latest:** {latest['insight']}")
116
+
117
+ # Data quality indicator
118
+ quality_score = 100
119
+ if st.session_state.workflow.stats['missing_values'] > 0:
120
+ quality_score -= 30
121
+ if st.session_state.workflow.stats['duplicates'] > 0:
122
+ quality_score -= 20
123
+
124
+ st.metric("Data Quality", f"{quality_score}%")
125
 
126
+ except Exception as e:
127
+ st.error(f"Error: {str(e)}")
128
+ st.info("Please check your file format and try again.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
 
130
  if __name__ == "__main__":
131
  main()