entropy25 committed on
Commit
f71de9c
·
verified ·
1 Parent(s): 59db6f8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +111 -343
app.py CHANGED
@@ -1,363 +1,131 @@
1
- """
2
- Data Analysis Platform
3
- Copyright (c) 2025 JEAN YOUNG
4
- All rights reserved.
5
-
6
- This software is proprietary and confidential.
7
- Unauthorized copying, distribution, or use is prohibited.
8
- """
9
  import streamlit as st
10
  import pandas as pd
11
- import numpy as np
12
- import plotly.express as px
13
- import plotly.graph_objects as go
14
- from plotly.subplots import make_subplots
15
- import warnings
16
- from typing import Dict, List, Any, Optional
17
- warnings.filterwarnings('ignore')
18
-
19
- # Import custom modules
20
- from data_handler import (
21
- load_csv_with_encoding,
22
- load_excel_file,
23
- calculate_basic_stats,
24
- calculate_missing_data,
25
- calculate_correlation_matrix,
26
- get_column_types,
27
- clean_data
28
- )
29
- from analyzer import DataAnalyzer
30
-
31
- # Page configuration
32
- st.set_page_config(
33
- page_title="Enhanced Data Analysis Platform",
34
- page_icon="📊",
35
- layout="wide",
36
- initial_sidebar_state="expanded"
37
- )
38
-
39
- # Custom CSS
40
- st.markdown("""
41
- <style>
42
- .main-header {
43
- font-size: 2.5rem;
44
- font-weight: bold;
45
- text-align: center;
46
- margin-bottom: 2rem;
47
- color: #1f77b4;
48
- }
49
- .metric-card {
50
- background-color: #f0f2f6;
51
- padding: 1rem;
52
- border-radius: 10px;
53
- border-left: 5px solid #1f77b4;
54
- }
55
- .success-message {
56
- padding: 1rem;
57
- border-radius: 5px;
58
- background-color: #d4edda;
59
- border: 1px solid #c3e6cb;
60
- color: #155724;
61
- }
62
- </style>
63
- """, unsafe_allow_html=True)
64
 
65
  def main():
66
- st.markdown('<h1 class="main-header">📊 Data Analysis Platform</h1>', unsafe_allow_html=True)
 
 
 
 
67
 
68
- # Sidebar configuration
69
- st.sidebar.title("πŸ”§ Configuration")
70
 
71
- # File upload section
72
- st.sidebar.subheader("πŸ“ Data Upload")
73
- uploaded_file = st.sidebar.file_uploader(
74
- "Choose your data file",
75
- type=['csv', 'xlsx', 'xls'],
76
- help="Upload CSV or Excel files (max 100MB)"
77
- )
 
 
 
78
 
79
- # Main content area
80
  if uploaded_file is not None:
81
  try:
82
- # File size check
83
- file_size = len(uploaded_file.getvalue()) / (1024**2) # MB
84
- if file_size > 100:
85
- st.error(f"⚠️ File too large: {file_size:.1f}MB. Maximum allowed: 100MB")
86
- return
87
-
88
  # Load data
89
- with st.spinner("πŸ“₯ Loading data..."):
90
- df = load_data_file(uploaded_file)
91
 
92
- if df is not None and not df.empty:
93
- st.success(f"✅ Data loaded successfully! Shape: {df.shape}")
94
-
95
- # Initialize analyzer
96
- analyzer = DataAnalyzer(df)
97
-
98
- # Sidebar options
99
- st.sidebar.subheader("🎯 Analysis Options")
100
- analysis_steps = [
101
- "πŸ“Š Data Overview",
102
- "πŸ” Data Exploration",
103
- "🧹 Data Quality Check",
104
- "πŸ”¬ Advanced Analysis",
105
- "πŸ€– Machine Learning",
106
- "πŸ“ˆ Insights & Report"
107
- ]
108
-
109
- selected_step = st.sidebar.selectbox(
110
- "Select Analysis Step:",
111
- analysis_steps,
112
- index=0
113
- )
114
-
115
- # Display selected analysis
116
- display_analysis_step(analyzer, selected_step, df)
117
-
118
- else:
119
- st.error("❌ Failed to load data. Please check your file format.")
120
-
121
- except Exception as e:
122
- st.error(f"❌ Error processing file: {str(e)}")
123
- else:
124
- # Welcome screen
125
- display_welcome_screen()
126
-
127
- def load_data_file(uploaded_file) -> Optional[pd.DataFrame]:
128
- """Load uploaded file based on its extension"""
129
- try:
130
- file_extension = uploaded_file.name.split('.')[-1].lower()
131
- file_content = uploaded_file.getvalue()
132
-
133
- if file_extension == 'csv':
134
- return load_csv_with_encoding(file_content, uploaded_file.name)
135
- elif file_extension in ['xlsx', 'xls']:
136
- return load_excel_file(file_content, uploaded_file.name)
137
- else:
138
- st.error("❌ Unsupported file format. Please upload CSV or Excel files.")
139
- return None
140
 
141
- except Exception as e:
142
- st.error(f"❌ Error loading file: {str(e)}")
143
- return None
144
-
145
- def display_analysis_step(analyzer: DataAnalyzer, step: str, df: pd.DataFrame):
146
- """Display the selected analysis step"""
147
-
148
- if step == "πŸ“Š Data Overview":
149
- display_data_overview(analyzer, df)
150
- elif step == "πŸ” Data Exploration":
151
- display_data_exploration(analyzer, df)
152
- elif step == "🧹 Data Quality Check":
153
- display_data_quality(analyzer, df)
154
- elif step == "πŸ”¬ Advanced Analysis":
155
- display_advanced_analysis(analyzer, df)
156
- elif step == "πŸ€– Machine Learning":
157
- display_machine_learning(analyzer, df)
158
- elif step == "πŸ“ˆ Insights & Report":
159
- display_insights_report(analyzer, df)
160
-
161
- def display_data_overview(analyzer: DataAnalyzer, df: pd.DataFrame):
162
- """Display data overview section"""
163
- st.header("πŸ“Š Data Overview")
164
-
165
- # Basic statistics
166
- stats = calculate_basic_stats(df)
167
-
168
- # Display metrics
169
- col1, col2, col3, col4 = st.columns(4)
170
- with col1:
171
- st.metric("πŸ“ Rows", f"{stats['shape'][0]:,}")
172
- with col2:
173
- st.metric("πŸ“‹ Columns", f"{stats['shape'][1]:,}")
174
- with col3:
175
- st.metric("πŸ’Ύ Memory Usage", f"{stats['memory_usage']:.1f} MB")
176
- with col4:
177
- st.metric("βœ… Completeness", f"{stats['completeness']:.1f}%")
178
-
179
- # Data types
180
- col1, col2 = st.columns([1, 1])
181
-
182
- with col1:
183
- st.subheader("πŸ“Š Data Types")
184
- dtype_df = pd.DataFrame(list(stats['dtypes'].items()), columns=['Type', 'Count'])
185
- fig = px.pie(dtype_df, values='Count', names='Type', title="Column Data Types")
186
- st.plotly_chart(fig, use_container_width=True)
187
-
188
- with col2:
189
- st.subheader("πŸ” Data Sample")
190
- st.dataframe(df.head(10), use_container_width=True)
191
-
192
- def display_data_exploration(analyzer: DataAnalyzer, df: pd.DataFrame):
193
- """Display data exploration section"""
194
- st.header("πŸ” Data Exploration")
195
-
196
- # Column selection for exploration
197
- numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
198
- categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
199
-
200
- if numeric_cols:
201
- st.subheader("πŸ“ˆ Numeric Data Distribution")
202
- selected_numeric = st.selectbox("Select numeric column:", numeric_cols)
203
-
204
- col1, col2 = st.columns([2, 1])
205
- with col1:
206
- fig = px.histogram(df, x=selected_numeric, title=f"Distribution of {selected_numeric}")
207
- st.plotly_chart(fig, use_container_width=True)
208
-
209
- with col2:
210
- st.write("**Statistics:**")
211
- stats = df[selected_numeric].describe()
212
- st.dataframe(stats)
213
-
214
- if len(numeric_cols) >= 2:
215
- st.subheader("πŸ”— Correlation Analysis")
216
- corr_matrix = calculate_correlation_matrix(df)
217
- if not corr_matrix.empty:
218
- fig = px.imshow(corr_matrix, text_auto=True, aspect="auto",
219
- title="Correlation Matrix")
220
- st.plotly_chart(fig, use_container_width=True)
221
-
222
- def display_data_quality(analyzer: DataAnalyzer, df: pd.DataFrame):
223
- """Display data quality check section"""
224
- st.header("🧹 Data Quality Check")
225
-
226
- # Missing data analysis
227
- missing_df = calculate_missing_data(df)
228
-
229
- if not missing_df.empty:
230
- st.subheader("❓ Missing Data Analysis")
231
- st.dataframe(missing_df, use_container_width=True)
232
-
233
- # Missing data visualization
234
- fig = px.bar(missing_df, x='Column', y='Missing %',
235
- title="Missing Data by Column",
236
- color='Severity',
237
- color_discrete_map={
238
- 'Critical': '#dc3545',
239
- 'High': '#fd7e14',
240
- 'Medium': '#ffc107',
241
- 'Low': '#28a745'
242
- })
243
- st.plotly_chart(fig, use_container_width=True)
244
- else:
245
- st.success("βœ… No missing data found!")
246
-
247
- # Duplicate analysis
248
- duplicates = df.duplicated().sum()
249
- if duplicates > 0:
250
- st.warning(f"⚠️ Found {duplicates} duplicate rows")
251
- else:
252
- st.success("βœ… No duplicate rows found!")
253
-
254
- def display_advanced_analysis(analyzer: DataAnalyzer, df: pd.DataFrame):
255
- """Display advanced analysis section"""
256
- st.header("πŸ”¬ Advanced Analysis")
257
-
258
- numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
259
-
260
- if len(numeric_cols) >= 2:
261
- st.subheader("🎯 Scatter Plot Analysis")
262
-
263
- col1, col2 = st.columns(2)
264
- with col1:
265
- x_col = st.selectbox("Select X-axis:", numeric_cols, key="x_axis")
266
- with col2:
267
- y_col = st.selectbox("Select Y-axis:", numeric_cols, key="y_axis", index=1 if len(numeric_cols) > 1 else 0)
268
-
269
- if x_col != y_col:
270
- fig = px.scatter(df, x=x_col, y=y_col, title=f"{x_col} vs {y_col}")
271
- st.plotly_chart(fig, use_container_width=True)
272
-
273
- def display_machine_learning(analyzer: DataAnalyzer, df: pd.DataFrame):
274
- """Display machine learning section"""
275
- st.header("πŸ€– Machine Learning")
276
-
277
- numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
278
-
279
- if len(numeric_cols) < 2:
280
- st.warning("⚠️ Need at least 2 numeric columns for ML analysis")
281
- return
282
-
283
- st.subheader("🎯 Model Configuration")
284
- target_col = st.selectbox("Select target column:", numeric_cols)
285
-
286
- if st.button("πŸš€ Run ML Analysis"):
287
- with st.spinner("πŸ€– Training models..."):
288
- try:
289
- results = analyzer.run_ml_analysis(target_col)
290
-
291
- if results:
292
- st.success("βœ… ML Analysis completed!")
293
-
294
- # Display results
295
- for model_name, metrics in results.items():
296
- st.subheader(f"πŸ“Š {model_name}")
297
-
298
- col1, col2 = st.columns(2)
299
- with col1:
300
- for metric, value in metrics.items():
301
- if isinstance(value, (int, float)):
302
- st.metric(metric.replace('_', ' ').title(), f"{value:.4f}")
303
-
304
- else:
305
- st.error("❌ ML analysis failed")
306
-
307
- except Exception as e:
308
- st.error(f"❌ Error in ML analysis: {str(e)}")
309
-
310
- def display_insights_report(analyzer: DataAnalyzer, df: pd.DataFrame):
311
- """Display insights and report section"""
312
- st.header("πŸ“ˆ Insights & Report")
313
-
314
- # Generate comprehensive report
315
- with st.spinner("πŸ“ Generating insights..."):
316
- try:
317
- insights = analyzer.generate_insights()
318
 
319
- for section, content in insights.items():
320
- st.subheader(f"πŸ“Š {section.replace('_', ' ').title()}")
321
-
322
- if isinstance(content, dict):
323
- for key, value in content.items():
324
- st.write(f"**{key.replace('_', ' ').title()}:** {value}")
325
- elif isinstance(content, list):
326
- for item in content:
327
- st.write(f"β€’ {item}")
328
  else:
329
- st.write(content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
330
 
331
- st.write("---")
 
332
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
333
  except Exception as e:
334
- st.error(f"❌ Error generating insights: {str(e)}")
335
-
336
- def display_welcome_screen():
337
- """Display welcome screen when no file is uploaded"""
338
- st.markdown("""
339
- ## πŸš€ Welcome to Enhanced Data Analysis Platform
340
-
341
- **Features:**
342
- - πŸ“Š **Comprehensive Data Overview** - Get instant insights about your data
343
- - πŸ” **Interactive Exploration** - Visualize patterns and relationships
344
- - 🧹 **Data Quality Assessment** - Identify and address data issues
345
- - πŸ”¬ **Advanced Analytics** - Perform statistical analysis
346
- - πŸ€– **Machine Learning** - Automated model building and evaluation
347
- - πŸ“ˆ **Smart Insights** - AI-generated recommendations
348
-
349
- **Supported Formats:**
350
- - CSV files (.csv)
351
- - Excel files (.xlsx, .xls)
352
-
353
- **Getting Started:**
354
- 1. Upload your data file using the sidebar
355
- 2. Select analysis steps to explore your data
356
- 3. Generate insights and export results
357
-
358
- ---
359
- *Upload a file to begin your analysis journey!*
360
- """)
361
 
362
  if __name__ == "__main__":
363
  main()
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import pandas as pd
3
+ from data_handler import load_data
4
+ from analyzer import DataAnalysisWorkflow, AIAssistant
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  def main():
7
+ st.set_page_config(
8
+ page_title="Data Analysis Platform",
9
+ page_icon="📊",
10
+ layout="wide"
11
+ )
12
 
13
+ st.title("📊 Data Analysis Platform")
14
+ st.markdown("**Optimized workflow with caching and pagination**")
15
 
16
+ # Initialize session state
17
+ if 'current_stage' not in st.session_state:
18
+ st.session_state.current_stage = 1
19
+ if 'workflow' not in st.session_state:
20
+ st.session_state.workflow = None
21
+ if 'ai_assistant' not in st.session_state:
22
+ st.session_state.ai_assistant = AIAssistant()
23
+
24
+ # File upload
25
+ uploaded_file = st.file_uploader("Upload Dataset", type=['csv', 'xlsx'])
26
 
 
27
  if uploaded_file is not None:
28
  try:
 
 
 
 
 
 
29
  # Load data
30
+ df = load_data(uploaded_file)
31
+ st.success(f"✅ Dataset loaded! Shape: {df.shape}")
32
 
33
+ # Initialize workflow
34
+ if st.session_state.workflow is None:
35
+ st.session_state.workflow = DataAnalysisWorkflow(df)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
+ # Progress sidebar
38
+ st.sidebar.header("Progress")
39
+ progress = st.sidebar.progress(st.session_state.current_stage / 5)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
+ stages = ["Data Overview", "Exploration", "Quality Check", "Analysis", "Summary"]
42
+ for i, stage in enumerate(stages, 1):
43
+ if i == st.session_state.current_stage:
44
+ st.sidebar.write(f"🔄 **{i}. {stage}**")
45
+ elif i < st.session_state.current_stage:
46
+ st.sidebar.write(f"✅ {i}. {stage}")
 
 
 
47
  else:
48
+ st.sidebar.write(f"⏳ {i}. {stage}")
49
+
50
+ # Navigation
51
+ col1, col2 = st.sidebar.columns(2)
52
+ with col1:
53
+ if st.button("← Previous") and st.session_state.current_stage > 1:
54
+ st.session_state.current_stage -= 1
55
+ st.rerun()
56
+ with col2:
57
+ if st.button("Next →") and st.session_state.current_stage < 5:
58
+ st.session_state.current_stage += 1
59
+ st.rerun()
60
+
61
+ # Recent insights
62
+ st.sidebar.header("💡 Recent Insights")
63
+ recent_insights = st.session_state.workflow.insights[-3:]
64
+ for insight in recent_insights:
65
+ st.sidebar.info(f"**Stage {insight['stage']}:** {insight['insight']}")
66
+
67
+ # Main content with AI assistant
68
+ main_col, ai_col = st.columns([3, 1])
69
+
70
+ with main_col:
71
+ # Execute current stage
72
+ if st.session_state.current_stage == 1:
73
+ st.session_state.workflow.stage_1_overview()
74
+ elif st.session_state.current_stage == 2:
75
+ st.session_state.workflow.stage_2_exploration()
76
+ elif st.session_state.current_stage == 3:
77
+ st.session_state.workflow.stage_3_cleaning()
78
+ elif st.session_state.current_stage == 4:
79
+ st.session_state.workflow.stage_4_analysis()
80
+ elif st.session_state.current_stage == 5:
81
+ st.session_state.workflow.stage_5_summary()
82
+
83
+ with ai_col:
84
+ st.subheader("🤖 AI Assistant")
85
 
86
+ # AI model selection
87
+ available_models = st.session_state.ai_assistant.get_available_models()
88
 
89
+ if available_models:
90
+ selected_model = st.selectbox("AI Model:", available_models)
91
+
92
+ if st.button("Get AI Insights"):
93
+ if st.session_state.workflow.insights:
94
+ with st.spinner("Analyzing with AI..."):
95
+ ai_analysis = st.session_state.ai_assistant.analyze_insights(
96
+ df, st.session_state.workflow.insights, selected_model
97
+ )
98
+ st.write("**AI Analysis:**")
99
+ st.write(ai_analysis)
100
+ else:
101
+ st.warning("Complete some analysis stages first.")
102
+ else:
103
+ st.warning("No AI models available.")
104
+ st.info("Set GOOGLE_API_KEY or OPENAI_API_KEY environment variables.")
105
+
106
+ # Quick insights
107
+ st.subheader("📊 Quick Stats")
108
+ if st.session_state.workflow.insights:
109
+ st.metric("Total Insights", len(st.session_state.workflow.insights))
110
+ st.metric("Current Stage", f"{st.session_state.current_stage}/5")
111
+
112
+ # Latest insight
113
+ if st.session_state.workflow.insights:
114
+ latest = st.session_state.workflow.insights[-1]
115
+ st.info(f"**Latest:** {latest['insight']}")
116
+
117
+ # Data quality indicator
118
+ quality_score = 100
119
+ if st.session_state.workflow.stats['missing_values'] > 0:
120
+ quality_score -= 30
121
+ if st.session_state.workflow.stats['duplicates'] > 0:
122
+ quality_score -= 20
123
+
124
+ st.metric("Data Quality", f"{quality_score}%")
125
+
126
  except Exception as e:
127
+ st.error(f"Error: {str(e)}")
128
+ st.info("Please check your file format and try again.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
 
130
  if __name__ == "__main__":
131
  main()