entropy25 committed on
Commit
2fad68d
·
verified ·
1 Parent(s): c50f214

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -366
app.py CHANGED
@@ -1,395 +1,131 @@
1
  import streamlit as st
2
  import pandas as pd
3
- import logging
4
- from data_handler import load_data, validate_dataframe
5
  from analyzer import DataAnalysisWorkflow, AIAssistant
6
 
7
- # Configure logging
8
- logging.basicConfig(level=logging.INFO)
9
- logger = logging.getLogger(__name__)
10
-
11
def initialize_session_state():
    """Ensure every session-state key the app relies on exists.

    Existing values are never overwritten, so this is safe to call on
    every Streamlit rerun.
    """
    for name, default in (
        ('current_stage', 1),
        ('workflow', None),
        ('ai_assistant', None),
        ('show_help', False),
        ('analysis_complete', False),
        ('error_log', []),
    ):
        if name not in st.session_state:
            st.session_state[name] = default
25
-
26
def display_header():
    """Render page config, the title banner, and — once a workflow
    exists — a four-column strip of headline metrics."""
    st.set_page_config(
        page_title="Data Analysis Platform",
        page_icon="📊",
        layout="wide",
        initial_sidebar_state="expanded"
    )

    st.title("📊 Data Analysis Platform")
    st.markdown("**Professional data analysis workflow with AI assistance**")

    # Headline metrics only make sense after data has been loaded.
    wf = st.session_state.workflow
    if wf is not None:
        rows_col, cols_col, insights_col, progress_col = st.columns(4)
        with rows_col:
            st.metric("📁 Rows", f"{wf.df.shape[0]:,}")
        with cols_col:
            st.metric("📋 Columns", f"{wf.df.shape[1]:,}")
        with insights_col:
            st.metric("🔍 Insights", len(wf.insights))
        with progress_col:
            pct_done = (st.session_state.current_stage / 5) * 100
            st.metric("📈 Progress", f"{pct_done:.0f}%")
50
-
51
def display_sidebar():
    """Sidebar: progress bar, per-stage status list, Previous/Next
    navigation, and the (last five) logged errors."""
    st.sidebar.header("🗺️ Analysis Progress")

    current = st.session_state.current_stage
    st.sidebar.progress(current / 5)

    stage_list = [
        {"name": "Data Overview", "icon": "📊"},
        {"name": "Exploration", "icon": "🔍"},
        {"name": "Quality Check", "icon": "🧹"},
        {"name": "Analysis", "icon": "🔬"},
        {"name": "Summary", "icon": "📈"}
    ]

    st.sidebar.markdown("### 📋 Analysis Stages")

    # Mark the active stage, completed stages, and pending stages.
    for idx, info in enumerate(stage_list, 1):
        label = info['name']
        if idx == current:
            st.sidebar.markdown(f"🔄 **{idx}. {label}**")
        elif idx < current:
            st.sidebar.markdown(f"✅ {idx}. {label}")
        else:
            st.sidebar.markdown(f"⏳ {idx}. {label}")

    # Navigation buttons
    st.sidebar.markdown("### 🧭 Navigation")
    prev_col, next_col = st.sidebar.columns(2)

    with prev_col:
        if st.button("⬅️ Previous", disabled=current <= 1):
            st.session_state.current_stage -= 1
            st.rerun()

    with next_col:
        if st.button("➡️ Next", disabled=current >= 5):
            st.session_state.current_stage += 1
            st.rerun()

    # Error log (most recent five entries only)
    if st.session_state.error_log:
        with st.sidebar.expander("⚠️ Error Log", expanded=False):
            for entry in st.session_state.error_log[-5:]:
                st.error(entry)
95
-
96
def display_ai_assistant():
    """AI assistant panel.

    Lazily constructs the shared AIAssistant, lists its available models,
    and on demand runs an AI analysis over the workflow's insights.
    Failures are shown to the user and appended to the session error log.
    """
    st.subheader("🤖 AI Assistant")

    if st.session_state.ai_assistant is None:
        st.session_state.ai_assistant = AIAssistant()

    try:
        available_models = st.session_state.ai_assistant.get_available_models()
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallowed
        # SystemExit/KeyboardInterrupt. Treat any lookup failure as
        # "no models available".
        available_models = []

    if available_models:
        selected_model = st.selectbox("AI Model:", available_models)

        if st.button("🧠 Get AI Insights", type="primary"):
            # Require both a loaded workflow and at least one insight.
            if st.session_state.workflow and st.session_state.workflow.insights:
                with st.spinner("🔮 AI is analyzing your data..."):
                    try:
                        ai_analysis = st.session_state.ai_assistant.analyze_insights(
                            st.session_state.workflow.df,
                            st.session_state.workflow.insights,
                            selected_model
                        )

                        # analyze_insights reports failures as a string
                        # containing "Error" rather than raising.
                        if ai_analysis and "Error" not in ai_analysis:
                            st.markdown("### 🎯 AI Analysis Results")
                            st.markdown(ai_analysis)
                            st.session_state.workflow.add_insight("AI analysis completed",
                                st.session_state.current_stage, "success")
                        else:
                            st.error("Failed to get AI analysis")

                    except Exception as e:
                        error_msg = f"AI analysis failed: {str(e)}"
                        st.error(error_msg)
                        st.session_state.error_log.append(error_msg)
            else:
                st.warning("⚠️ Complete some analysis stages first")
    else:
        st.warning("⚠️ No AI models available")
137
-
138
def handle_file_upload():
    """File upload with validation and preview.

    Returns True when a dataset is loaded and valid, False otherwise.
    Sets st.session_state.workflow / current_stage / analysis_complete
    when a *new* file is accepted.
    """
    st.markdown("### 📁 Upload Your Dataset")

    uploaded_file = st.file_uploader(
        "Choose your data file",
        type=['csv', 'xlsx', 'xls'],
        help="Supported formats: CSV, Excel (.xlsx, .xls)"
    )

    if uploaded_file is not None:
        file_size = len(uploaded_file.getvalue()) / 1024**2

        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("📁 File Name", uploaded_file.name)
        with col2:
            st.metric("📊 File Size", f"{file_size:.1f} MB")
        with col3:
            file_type = uploaded_file.name.split('.')[-1].upper()
            st.metric("📋 Format", file_type)

        with st.spinner("🔄 Loading data..."):
            try:
                df = load_data(uploaded_file)

                if df is not None:
                    is_valid, validation_issues = validate_dataframe(df)

                    if is_valid:
                        st.success(f"✅ **Dataset loaded!** Shape: {df.shape[0]:,} rows × {df.shape[1]:,} columns")

                        with st.expander("👀 Data Preview", expanded=False):
                            st.dataframe(df.head(), use_container_width=True)

                        # BUG FIX: this function runs on *every* Streamlit
                        # rerun while a file stays selected, and previously
                        # rebuilt the workflow and reset current_stage = 1
                        # each time — undoing the sidebar's Next/Previous
                        # navigation. Only (re)initialize when a different
                        # file is uploaded.
                        if st.session_state.get('_loaded_file') != uploaded_file.name:
                            st.session_state.workflow = DataAnalysisWorkflow(df)
                            st.session_state.current_stage = 1
                            st.session_state.analysis_complete = False
                            st.session_state['_loaded_file'] = uploaded_file.name

                        return True

                    else:
                        st.error("❌ **Data validation failed:**")
                        for issue in validation_issues:
                            st.write(f"• {issue}")
                        return False
                else:
                    st.error(" Failed to load data")
                    return False

            except Exception as e:
                error_msg = f"Error loading file: {str(e)}"
                st.error(f"❌ {error_msg}")
                st.session_state.error_log.append(error_msg)
                return False

    # No file selected yet.
    return False
195
-
196
def safe_execute_stage():
    """Render the UI for the current analysis stage (1-5).

    Reads the workflow and stage number from st.session_state and draws
    the corresponding Streamlit widgets. Any exception is caught at the
    bottom, shown to the user, appended to the session error log, and
    logged via the module-level logger — the app never crashes mid-stage.
    """
    try:
        workflow = st.session_state.workflow
        stage = st.session_state.current_stage

        if stage == 1:
            # Stage 1: Data Overview with safe execution
            st.subheader("📊 Data Overview")

            # Basic info
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Rows", f"{len(workflow.df):,}")
            with col2:
                st.metric("Columns", f"{len(workflow.df.columns):,}")
            with col3:
                # deep=True includes object-dtype payloads in the estimate
                memory_mb = workflow.df.memory_usage(deep=True).sum() / 1024**2
                st.metric("Memory", f"{memory_mb:.1f} MB")

            # Data types
            st.markdown("### Column Information")
            dtype_counts = workflow.df.dtypes.value_counts()
            for dtype, count in dtype_counts.items():
                st.write(f"• **{dtype}**: {count} columns")

            # Missing values
            missing_total = workflow.df.isnull().sum().sum()
            duplicates = workflow.df.duplicated().sum()

            col1, col2 = st.columns(2)
            with col1:
                st.metric("Missing Values", f"{missing_total:,}")
            with col2:
                st.metric("Duplicates", f"{duplicates:,}")

            # Quality score calculation (fixed)
            # Heuristic: start at 100, penalize missing cells x2 and
            # duplicate rows x3 (both as percentages), floor at 0.
            total_cells = len(workflow.df) * len(workflow.df.columns)
            missing_pct = (missing_total / total_cells) * 100 if total_cells > 0 else 0
            duplicate_pct = (duplicates / len(workflow.df)) * 100 if len(workflow.df) > 0 else 0

            quality_score = max(0, 100 - (missing_pct * 2) - (duplicate_pct * 3))

            if quality_score >= 90:
                st.success(f"🌟 Excellent Data Quality: {quality_score:.0f}%")
            elif quality_score >= 70:
                st.info(f"👍 Good Data Quality: {quality_score:.0f}%")
            else:
                st.warning(f"⚠️ Data Quality Needs Improvement: {quality_score:.0f}%")

            # Add insight to workflow
            workflow.add_insight(f"Data overview completed. Quality score: {quality_score:.0f}%",
                stage, "success")

        elif stage == 2:
            # Stage 2: Data Exploration
            st.subheader("🔍 Data Exploration")

            numeric_cols = workflow.df.select_dtypes(include=['number']).columns
            categorical_cols = workflow.df.select_dtypes(include=['object', 'category']).columns

            if len(numeric_cols) > 0:
                st.markdown("### Numeric Columns")
                st.dataframe(workflow.df[numeric_cols].describe(), use_container_width=True)

            if len(categorical_cols) > 0:
                st.markdown("### Categorical Columns")
                for col in categorical_cols[:5]:  # Show first 5 categorical columns
                    unique_count = workflow.df[col].nunique()
                    st.write(f"**{col}**: {unique_count} unique values")

            workflow.add_insight("Data exploration completed", stage, "success")

        elif stage == 3:
            # Stage 3: Data Cleaning
            st.subheader("🧹 Data Quality Check")

            # Missing values by column
            missing_by_col = workflow.df.isnull().sum()
            missing_cols = missing_by_col[missing_by_col > 0]

            if len(missing_cols) > 0:
                st.markdown("### Missing Values by Column")
                for col, count in missing_cols.items():
                    pct = (count / len(workflow.df)) * 100
                    st.write(f"• **{col}**: {count} missing ({pct:.1f}%)")
            else:
                st.success("✅ No missing values found")

            # Duplicates
            duplicates = workflow.df.duplicated().sum()
            if duplicates > 0:
                st.warning(f"⚠️ Found {duplicates} duplicate rows")
            else:
                st.success("✅ No duplicate rows found")

            workflow.add_insight("Quality check completed", stage, "success")

        elif stage == 4:
            # Stage 4: Advanced Analysis
            st.subheader("🔬 Advanced Analysis")

            numeric_cols = workflow.df.select_dtypes(include=['number']).columns

            # Correlation needs at least two numeric columns; otherwise the
            # stage renders nothing but still records the insight below.
            if len(numeric_cols) >= 2:
                st.markdown("### Correlation Matrix")
                corr_matrix = workflow.df[numeric_cols].corr()
                st.dataframe(corr_matrix, use_container_width=True)

            workflow.add_insight("Advanced analysis completed", stage, "success")

        elif stage == 5:
            # Stage 5: Summary
            st.subheader("📈 Analysis Summary")

            st.markdown("### Analysis Complete!")
            st.write(f"• Dataset: {len(workflow.df):,} rows × {len(workflow.df.columns):,} columns")
            st.write(f"• Insights generated: {len(workflow.insights)}")
            st.write(f"• Analysis stages completed: {st.session_state.current_stage}")

            # Export options
            # NOTE(review): the download_button is nested inside a regular
            # button's click handler, so it only appears for the one rerun
            # after "📊 Download CSV" is pressed — confirm this is intended.
            st.markdown("### Export Options")
            if st.button("📊 Download CSV"):
                csv = workflow.df.to_csv(index=False)
                st.download_button(
                    label="Download CSV",
                    data=csv,
                    file_name="analyzed_data.csv",
                    mime="text/csv"
                )

            # Celebrate only the first time the summary stage is reached.
            if not st.session_state.analysis_complete:
                st.session_state.analysis_complete = True
                st.balloons()

            workflow.add_insight("Analysis summary completed", stage, "success")

        # Initialize stats if not exists
        if not hasattr(workflow, 'stats'):
            workflow.stats = {
                'missing_values': workflow.df.isnull().sum().sum(),
                'duplicates': workflow.df.duplicated().sum()
            }

    except Exception as e:
        error_msg = f"Error in stage {st.session_state.current_stage}: {str(e)}"
        st.error(f"❌ {error_msg}")
        st.session_state.error_log.append(error_msg)
        logger.error(error_msg)

        # Show fallback content
        st.warning("⚠️ Analysis stage encountered an issue. Please try refreshing.")
348
-
349
def main():
    """Main application.

    Orchestrates the page: session-state setup, header, file upload,
    then — once data is loaded — the staged analysis (left column), the
    AI assistant (right column), and the progress sidebar. A catch-all
    handler reports any unexpected error; a Restart button clears the
    whole session.
    """
    try:
        initialize_session_state()
        display_header()

        data_loaded = handle_file_upload()

        if data_loaded and st.session_state.workflow is not None:
            # 3:1 split — staged analysis on the left, AI panel on the right.
            main_col, ai_col = st.columns([3, 1])

            with main_col:
                safe_execute_stage()

            with ai_col:
                display_ai_assistant()

            display_sidebar()

            if st.session_state.analysis_complete:
                st.success("🎉 **Analysis Complete!**")

        elif not data_loaded:
            # Landing copy shown before any file is uploaded.
            st.markdown("### 🚀 Welcome to Data Analysis Platform")
            st.markdown("""
            **Features:**
            - 5-stage analysis workflow
            - AI-powered insights
            - Data quality assessment
            - Interactive visualizations
            - Export capabilities

            **Supported Formats:** CSV, Excel (.xlsx, .xls)
            """)

    except Exception as e:
        # Top-level boundary: surface the error and keep it in the session log.
        error_msg = f"Application error: {str(e)}"
        st.error(f"❌ {error_msg}")
        st.session_state.error_log.append(error_msg)

    # Restart sits after the try/except so it renders even when the app
    # body raised; clearing session state forces a fresh start on rerun.
    if st.button("🔄 Restart"):
        for key in list(st.session_state.keys()):
            del st.session_state[key]
        st.rerun()


if __name__ == "__main__":
    main()
 
1
  import streamlit as st
2
  import pandas as pd
3
+ from data_handler import load_data
 
4
  from analyzer import DataAnalysisWorkflow, AIAssistant
5
 
6
def main():
    """Single-page Streamlit app: upload a dataset, step through a
    five-stage analysis workflow, and query an AI assistant about the
    generated insights. All state lives in st.session_state so it
    survives Streamlit's rerun-on-interaction model.
    """
    st.set_page_config(
        page_title="Data Analysis Platform",
        page_icon="📊",
        layout="wide"
    )

    st.title("📊 Data Analysis Platform")
    st.markdown("**Optimized workflow with caching and pagination**")

    # Initialize session state (persists across reruns)
    if 'current_stage' not in st.session_state:
        st.session_state.current_stage = 1
    if 'workflow' not in st.session_state:
        st.session_state.workflow = None
    if 'ai_assistant' not in st.session_state:
        st.session_state.ai_assistant = AIAssistant()

    # File upload
    uploaded_file = st.file_uploader("Upload Dataset", type=['csv', 'xlsx'])

    if uploaded_file is not None:
        try:
            # Load data (runs every rerun while a file stays selected)
            df = load_data(uploaded_file)
            st.success(f"✅ Dataset loaded! Shape: {df.shape}")

            # Initialize workflow.
            # BUG FIX: checking only `is None` kept a stale workflow (and
            # its old insights) when the user uploaded a *different* file;
            # rebuild when the file name changes and restart the stages.
            if (st.session_state.workflow is None
                    or st.session_state.get('_loaded_file') != uploaded_file.name):
                st.session_state.workflow = DataAnalysisWorkflow(df)
                st.session_state['_loaded_file'] = uploaded_file.name
                st.session_state.current_stage = 1

            # Progress sidebar (return value of st.sidebar.progress unused)
            st.sidebar.header("Progress")
            st.sidebar.progress(st.session_state.current_stage / 5)

            stages = ["Data Overview", "Exploration", "Quality Check", "Analysis", "Summary"]
            for i, stage in enumerate(stages, 1):
                if i == st.session_state.current_stage:
                    st.sidebar.write(f"🔄 **{i}. {stage}**")
                elif i < st.session_state.current_stage:
                    st.sidebar.write(f"✅ {i}. {stage}")
                else:
                    st.sidebar.write(f" {i}. {stage}")

            # Navigation (each click triggers a rerun so the new stage renders)
            col1, col2 = st.sidebar.columns(2)
            with col1:
                if st.button(" Previous") and st.session_state.current_stage > 1:
                    st.session_state.current_stage -= 1
                    st.rerun()
            with col2:
                if st.button("Next ") and st.session_state.current_stage < 5:
                    st.session_state.current_stage += 1
                    st.rerun()

            # Recent insights (last three recorded by the workflow)
            st.sidebar.header("💡 Recent Insights")
            recent_insights = st.session_state.workflow.insights[-3:]
            for insight in recent_insights:
                st.sidebar.info(f"**Stage {insight['stage']}:** {insight['insight']}")

            # Main content with AI assistant
            main_col, ai_col = st.columns([3, 1])

            with main_col:
                # Execute current stage via a dispatch table
                stage_runners = {
                    1: st.session_state.workflow.stage_1_overview,
                    2: st.session_state.workflow.stage_2_exploration,
                    3: st.session_state.workflow.stage_3_cleaning,
                    4: st.session_state.workflow.stage_4_analysis,
                    5: st.session_state.workflow.stage_5_summary,
                }
                runner = stage_runners.get(st.session_state.current_stage)
                if runner is not None:
                    runner()

            with ai_col:
                st.subheader("🤖 AI Assistant")

                # AI model selection
                available_models = st.session_state.ai_assistant.get_available_models()

                if available_models:
                    selected_model = st.selectbox("AI Model:", available_models)

                    if st.button("Get AI Insights"):
                        if st.session_state.workflow.insights:
                            with st.spinner("Analyzing with AI..."):
                                ai_analysis = st.session_state.ai_assistant.analyze_insights(
                                    df, st.session_state.workflow.insights, selected_model
                                )
                                st.write("**AI Analysis:**")
                                st.write(ai_analysis)
                        else:
                            st.warning("Complete some analysis stages first.")
                else:
                    st.warning("No AI models available.")
                    st.info("Set GOOGLE_API_KEY or OPENAI_API_KEY environment variables.")

                # Quick insights
                st.subheader("📊 Quick Stats")
                if st.session_state.workflow.insights:
                    st.metric("Total Insights", len(st.session_state.workflow.insights))
                    st.metric("Current Stage", f"{st.session_state.current_stage}/5")

                # Latest insight
                if st.session_state.workflow.insights:
                    latest = st.session_state.workflow.insights[-1]
                    st.info(f"**Latest:** {latest['insight']}")

                # Data quality indicator.
                # BUG FIX: `workflow.stats` may not exist before any stage
                # populates it; indexing it raised AttributeError and tripped
                # the catch-all handler below. Read defensively instead.
                stats = getattr(st.session_state.workflow, 'stats', None) or {}
                quality_score = 100
                if stats.get('missing_values', 0) > 0:
                    quality_score -= 30
                if stats.get('duplicates', 0) > 0:
                    quality_score -= 20

                st.metric("Data Quality", f"{quality_score}%")

        except Exception as e:
            # Boundary handler: surface load/analysis failures to the user
            st.error(f"Error: {str(e)}")
            st.info("Please check your file format and try again.")


if __name__ == "__main__":
    main()