Dhruv Pawar committed on
Commit
c8cfb06
·
0 Parent(s):

Initial commit to DATA-WHISPERER-PRO

Browse files
Files changed (5) hide show
  1. .gitignore +5 -0
  2. README.md +0 -0
  3. main.py +554 -0
  4. requirements.txt +19 -0
  5. test_gemini.py +23 -0
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ .env
4
+ *.h5
5
+ .vscode/
README.md ADDED
Binary file (44 Bytes). View file
 
main.py ADDED
@@ -0,0 +1,554 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import plotly.express as px
5
+ import plotly.graph_objects as go
6
+ from plotly.subplots import make_subplots
7
+ from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
8
+ from sklearn.model_selection import train_test_split
9
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
10
+ from sklearn.decomposition import PCA
11
+ from sklearn.cluster import KMeans
12
+ import google.generativeai as genai
13
+ import os
14
+ from dotenv import load_dotenv
15
+ import json
16
+ import warnings
17
+ warnings.filterwarnings('ignore')
18
+
19
+ # Load environment variables
20
+ load_dotenv()
21
+ genai.configure(api_key=os.getenv('GEMINI_API_KEY'))
22
+ model = genai.GenerativeModel('gemini-1.5-flash')
23
+
24
+ st.set_page_config("DataWhisperer Pro", "🎯", layout="wide")
25
+
26
+ # ------------------------------------------------------------------
27
+ # Helper: safe Gemini wrapper
28
+ # ------------------------------------------------------------------
29
+ def safe_gemini(prompt: str, fallback: str = "AI service unavailable") -> str:
30
+ try:
31
+ return model.generate_content(prompt).text
32
+ except Exception:
33
+ return fallback
34
+
35
+ # ------------------------------------------------------------------
36
+ @st.cache_data
37
+ def load_data(file):
38
+ return pd.read_csv(file)
39
+
40
+ def generate_data_story(df, insights):
41
+ return safe_gemini(
42
+ f"""
43
+ Create a brief, professional data story (max 100 words) based on:
44
+ Dataset shape: {df.shape}
45
+ Columns: {list(df.columns)[:5]}...
46
+ Key insights: {insights[:2]}
47
+
48
+ Write as a data analyst presenting findings. Be specific and actionable.
49
+ """,
50
+ "Your data reveals interesting patterns worth exploring further."
51
+ )
52
+
53
+ def get_analysis_recommendations(df):
54
+ txt = safe_gemini(
55
+ f"""
56
+ Suggest 3 specific analyses for a dataset with:
57
+ {len(df.select_dtypes(include=[np.number]).columns)} numeric columns
58
+ {len(df.select_dtypes(include=['object']).columns)} categorical columns
59
+
60
+ Format: Brief actionable recommendations only. No explanations.
61
+ """,
62
+ "Correlation analysis\nDistribution profiling\nOutlier investigation"
63
+ )
64
+ return [line for line in txt.split('\n') if line.strip()][:3]
65
+
66
+ def generate_smart_features(df):
67
+ numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
68
+ txt = safe_gemini(
69
+ f"""
70
+ Suggest 2 simple feature engineering ideas for:
71
+ Columns: {numeric_cols[:3]}
72
+
73
+ Format: Column_name: transformation
74
+ Keep it simple and practical.
75
+ """,
76
+ "Consider log transformation for skewed distributions\nCreate interaction features"
77
+ )
78
+ return [s.strip() for s in txt.split('\n') if s.strip()][:2]
79
+
80
+ def anomaly_explanation(df, col, anomalies):
81
+ return safe_gemini(
82
+ f"""
83
+ In one sentence, explain why {len(anomalies)} anomalies were detected in '{col}'
84
+ (mean: {df[col].mean():.2f}, std: {df[col].std():.2f}).
85
+ Be technical but concise.
86
+ """,
87
+ f"Detected {len(anomalies)} values beyond expected range."
88
+ )
89
+
90
+ def generate_executive_summary(df, ml_results=None):
91
+ summary = dict(
92
+ rows=len(df),
93
+ columns=len(df.columns),
94
+ missing=df.isnull().sum().sum(),
95
+ numeric_cols=len(df.select_dtypes(include=[np.number]).columns),
96
+ )
97
+ if ml_results:
98
+ summary['ml_score'] = ml_results['score']
99
+ summary['top_feature'] = ml_results['features'].iloc[0]['feature']
100
+
101
+ return safe_gemini(
102
+ f"""
103
+ Write a 2-sentence executive summary for:
104
+ - Dataset: {summary['rows']} rows, {summary['columns']} columns
105
+ - Quality: {summary['missing']} missing values
106
+ - ML Performance: {summary.get('ml_score', 'N/A')}
107
+
108
+ Be direct and highlight the most important finding.
109
+ """,
110
+ "Dataset analysis complete. Key patterns identified for strategic decision-making."
111
+ )
112
+
113
+ def generate_ai_insights(df):
114
+ insights = []
115
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
116
+
117
+ # volatility
118
+ for col in numeric_cols[:3]:
119
+ mean = df[col].mean()
120
+ std = df[col].std()
121
+ cv = std / mean if mean else 0
122
+ if cv > 0.5:
123
+ insights.append(f"🎯 High volatility in {col} (CV: {cv:.2f})")
124
+
125
+ # correlations
126
+ if len(numeric_cols) > 1:
127
+ corr = df[numeric_cols].corr()
128
+ mask = (corr.abs() > 0.7) & (corr.abs() < 1)
129
+ pairs = np.column_stack(np.where(mask))
130
+ for r, c in pairs:
131
+ if r < c:
132
+ insights.append(f"πŸ”— Strong correlation: {numeric_cols[r]} ↔ {numeric_cols[c]}")
133
+
134
+ # quality
135
+ missing_ratio = df.isnull().sum().sum() / (len(df) * len(df.columns))
136
+ quality = (1 - missing_ratio) * 100
137
+ insights.append(f"πŸ’Ž Data Quality: {quality:.1f}%")
138
+
139
+ recs = get_analysis_recommendations(df)
140
+ if recs:
141
+ insights.append(f"πŸ€– AI suggests: {recs[0]}")
142
+
143
+ return insights[:5]
144
+
145
+ def create_comprehensive_eda(df):
146
+ numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
147
+ categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
148
+ figures = {}
149
+
150
+ # 1. Correlation
151
+ # 1. Correlation heat-map
152
+ if len(numeric_cols) >= 2:
153
+ corr = df[numeric_cols].astype(float).corr()
154
+ mask = pd.DataFrame(
155
+ np.triu(np.ones_like(corr, dtype=bool), k=1),
156
+ index=corr.index,
157
+ columns=corr.columns
158
+ )
159
+ fig = go.Figure(go.Heatmap(
160
+ z=corr.mask(mask),
161
+ x=corr.columns,
162
+ y=corr.columns,
163
+ colorscale='RdBu',
164
+ zmid=0,
165
+ text=np.round(corr.values, 2),
166
+ texttemplate='%{text}',
167
+ textfont={"size": 10},
168
+ hoverongaps=False
169
+ ))
170
+ fig.update_layout(title="πŸ”₯ Correlation Matrix", height=500)
171
+ figures['correlation'] = fig
172
+
173
+ # 2. Distributions
174
+ if len(numeric_cols) > 0:
175
+ n_cols = min(len(numeric_cols), 6)
176
+ fig = make_subplots(rows=2, cols=3,
177
+ subplot_titles=[f"{c}" for c in numeric_cols[:n_cols]])
178
+ for i, col in enumerate(numeric_cols[:n_cols]):
179
+ fig.add_trace(
180
+ go.Histogram(x=df[col].dropna(), name=col, showlegend=False,
181
+ marker_color='rgba(55, 128, 191, 0.7)'),
182
+ row=(i // 3) + 1, col=(i % 3) + 1
183
+ )
184
+ fig.update_layout(title="πŸ“Š Distribution Analysis", height=600)
185
+ figures['distributions'] = fig
186
+
187
+ # 3. Box-plots
188
+ if len(numeric_cols) > 0:
189
+ fig = go.Figure()
190
+ for i, col in enumerate(numeric_cols[:min(8, len(numeric_cols))]):
191
+ Q1, Q3 = df[col].quantile([0.25, 0.75])
192
+ IQR = Q3 - Q1
193
+ outliers = df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)][col]
194
+ fig.add_trace(go.Box(
195
+ y=df[col], name=f"{col} ({len(outliers)} outliers)",
196
+ boxpoints='outliers',
197
+ marker_color=px.colors.qualitative.Set3[i % len(px.colors.qualitative.Set3)]
198
+ ))
199
+ fig.update_layout(title="πŸ“¦ Anomaly Detection System", height=400)
200
+ figures['boxplots'] = fig
201
+
202
+ # 4. Scatter matrix
203
+ if len(numeric_cols) >= 2:
204
+ cols = numeric_cols[:min(4, len(numeric_cols))]
205
+ fig = px.scatter_matrix(df[cols], dimensions=cols,
206
+ title="🎯 Multi-Dimensional Analysis", height=700)
207
+ fig.update_traces(diagonal_visible=False,
208
+ marker=dict(size=5, opacity=0.6))
209
+ figures['scatter_matrix'] = fig
210
+
211
+ # 5. PCA
212
+ if len(numeric_cols) >= 3:
213
+ scaler = StandardScaler()
214
+ scaled = scaler.fit_transform(df[numeric_cols].fillna(df[numeric_cols].mean()))
215
+ pca = PCA(n_components=2).fit_transform(scaled)
216
+ n_clusters = min(4, max(2, len(df) // 50))
217
+ clusters = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(pca)
218
+ fig = px.scatter(x=pca[:, 0], y=pca[:, 1], color=clusters.astype(str),
219
+ title="🧬 AI Pattern Recognition",
220
+ labels={'x': f'PC1 ({PCA(2).fit(scaled).explained_variance_ratio_[0]:.1%})',
221
+ 'y': f'PC2 ({PCA(2).fit(scaled).explained_variance_ratio_[1]:.1%})'})
222
+ fig.update_layout(height=500)
223
+ figures['pca'] = fig
224
+
225
+ # 6. Trends
226
+ if len(df) > 20 and len(numeric_cols) > 0:
227
+ fig = go.Figure()
228
+ for col in numeric_cols[:3]:
229
+ ma = df[col].rolling(window=max(5, len(df) // 20)).mean()
230
+ fig.add_trace(go.Scatter(x=df.index, y=df[col], mode='lines',
231
+ name=col, opacity=0.6))
232
+ fig.add_trace(go.Scatter(x=df.index, y=ma, mode='lines',
233
+ name=f'{col} (Trend)', line=dict(width=3, dash='dash')))
234
+ fig.update_layout(title="πŸ“ˆ Trend Analysis", height=400, hovermode='x unified')
235
+ figures['trends'] = fig
236
+
237
+ # 7. Categorical bar
238
+ if categorical_cols:
239
+ cat_col = categorical_cols[0]
240
+ if df[cat_col].nunique() <= 20:
241
+ counts = df[cat_col].value_counts().head(10)
242
+ fig = px.bar(x=counts.index, y=counts.values,
243
+ title=f"🏷️ {cat_col} Distribution",
244
+ labels={'x': cat_col, 'y': 'Count'})
245
+ fig.update_traces(marker_color='lightblue',
246
+ marker_line_color='darkblue', marker_line_width=1.5)
247
+ fig.update_layout(height=400)
248
+ figures['categorical'] = fig
249
+
250
+ # 8. 3D scatter
251
+ if len(numeric_cols) >= 3:
252
+ fig = px.scatter_3d(df, x=numeric_cols[0], y=numeric_cols[1], z=numeric_cols[2],
253
+ color=df[numeric_cols[0]], title="🌐 3D Data Universe", height=600)
254
+ fig.update_traces(marker=dict(size=5, opacity=0.8))
255
+ figures['3d'] = fig
256
+
257
+ return figures
258
+
259
+ def quick_ml(df, target):
260
+ if target not in df.columns:
261
+ return None
262
+
263
+ X = df.drop(columns=[target])
264
+ y = df[target]
265
+
266
+ # categorical predictors
267
+ for col in X.select_dtypes(include=['object']):
268
+ X[col] = LabelEncoder().fit_transform(X[col].astype(str))
269
+ X = X.fillna(X.mean())
270
+
271
+ # target encoding
272
+ if y.dtype == 'object' or y.nunique() < 10:
273
+ y_enc = LabelEncoder().fit_transform(y.astype(str))
274
+ mdl = RandomForestClassifier(n_estimators=100, random_state=42)
275
+ task = "Classification"
276
+ else:
277
+ y_enc = y
278
+ mdl = RandomForestRegressor(n_estimators=100, random_state=42)
279
+ task = "Regression"
280
+
281
+ X_train, X_test, y_train, y_test = train_test_split(
282
+ X, y_enc, test_size=0.2, random_state=42)
283
+ mdl.fit(X_train, y_train)
284
+ score = mdl.score(X_test, y_test)
285
+
286
+ importance = pd.DataFrame({
287
+ 'feature': X.columns,
288
+ 'importance': mdl.feature_importances_
289
+ }).sort_values('importance', ascending=False).head(10)
290
+
291
+ insights = []
292
+ if score > 0.9:
293
+ insights.append("πŸ† Exceptional model performance achieved!")
294
+ elif score > 0.75:
295
+ insights.append("βœ… Strong predictive capability")
296
+ top_feat = importance.iloc[0]
297
+ insights.append(f"🎯 {top_feat['feature']} is the key driver ({top_feat['importance']*100:.1f}%)")
298
+ ai_tip = safe_gemini(
299
+ f"In 10 words, what business action does {score:.1%} {task.lower()} accuracy on {target} enable?",
300
+ "Use predictions to prioritize high-value opportunities"
301
+ )
302
+ insights.append(f"πŸ’‘ {ai_tip}")
303
+
304
+ return dict(score=score, task=task, features=importance, insights=insights)
305
+
306
+ # ------------------------------------------------------------------
307
+ # Streamlit UI
308
+ # ------------------------------------------------------------------
309
+ if 'df' not in st.session_state:
310
+ st.session_state.df = None
311
+ if 'ai_story' not in st.session_state:
312
+ st.session_state.ai_story = None
313
+
314
+ st.title("🎯 DataWhisperer Pro")
315
+ st.caption("AI-Powered Intelligence Platform with Gemini Integration")
316
+
317
+ with st.sidebar:
318
+ st.header("πŸ“ Data Control Center")
319
+ uploaded_file = st.file_uploader("Upload CSV", type="csv")
320
+ if uploaded_file:
321
+ st.session_state.df = load_data(uploaded_file)
322
+ st.success("βœ… Data loaded successfully!")
323
+ df = st.session_state.df
324
+ col1, col2 = st.columns(2)
325
+ col1.metric("Rows", f"{len(df):,}")
326
+ col2.metric("Columns", len(df.columns))
327
+
328
+ st.subheader("πŸ€– AI-Powered Insights")
329
+ insights = generate_ai_insights(df)
330
+ for insight in insights:
331
+ st.info(insight)
332
+
333
+ with st.spinner("🧠 Generating data narrative..."):
334
+ st.session_state.ai_story = generate_data_story(df, insights)
335
+
336
+ st.subheader("πŸ”§ AI Feature Suggestions")
337
+ suggestions = generate_smart_features(df)
338
+ for suggestion in suggestions:
339
+ st.code(suggestion, language='python')
340
+
341
+ if st.session_state.df is not None:
342
+ df = st.session_state.df
343
+ if st.session_state.ai_story:
344
+ st.markdown("### πŸ“– Your Data Story")
345
+ st.info(st.session_state.ai_story)
346
+
347
+ tab1, tab2, tab3, tab4 = st.tabs(["πŸ“Š Smart EDA", "πŸ“ˆ Custom Analysis", "πŸ€– AutoML", "πŸ§ͺ AI Lab"])
348
+
349
+ with tab1:
350
+ st.header("πŸ“Š Intelligent EDA Dashboard")
351
+ with st.spinner("🧠 AI analyzing patterns..."):
352
+ figures = create_comprehensive_eda(df)
353
+
354
+ st.subheader("πŸ’‘ Executive Metrics")
355
+ col1, col2, col3, col4 = st.columns(4)
356
+ with col1:
357
+ missing_pct = (df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100
358
+ st.metric("Data Quality", f"{100 - missing_pct:.1f}%",
359
+ "βœ… Good" if missing_pct < 5 else "⚠️ Review")
360
+ with col2:
361
+ st.metric("Numeric Features", len(df.select_dtypes(include=[np.number]).columns))
362
+ with col3:
363
+ st.metric("Categories", len(df.select_dtypes(include=['object']).columns))
364
+ with col4:
365
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
366
+ if len(numeric_cols) > 1:
367
+ corr = df[numeric_cols].corr()
368
+ high_corr = (corr.abs() > 0.7).sum().sum() - len(corr)
369
+ st.metric("Correlations", high_corr // 2)
370
+
371
+ for key, fig in figures.items():
372
+ st.plotly_chart(fig, use_container_width=True)
373
+
374
+ st.subheader("πŸ“Š Statistical Profile")
375
+ numeric_df = df.select_dtypes(include=[np.number])
376
+ if not numeric_df.empty:
377
+ st.dataframe(numeric_df.describe().round(2), use_container_width=True)
378
+
379
+ with tab2:
380
+ st.header("πŸ“ˆ Interactive Visualization Studio")
381
+ col1, col2 = st.columns([1, 2])
382
+ with col1:
383
+ viz_type = st.selectbox("Visualization Type",
384
+ ["Scatter", "Histogram", "Box", "Violin", "3D Scatter", "Bubble", "Heatmap"])
385
+ numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
386
+ if viz_type in ["Histogram", "Box", "Violin"]:
387
+ x_col = st.selectbox("Select Column", numeric_cols)
388
+ y_col = None
389
+ elif viz_type in ["Scatter", "Bubble"]:
390
+ x_col = st.selectbox("X-axis", numeric_cols)
391
+ y_col = st.selectbox("Y-axis", numeric_cols, index=1 if len(numeric_cols) > 1 else 0)
392
+ if viz_type == "Bubble" and len(numeric_cols) > 2:
393
+ size_col = st.selectbox("Bubble Size", numeric_cols, index=2)
394
+ elif viz_type == "3D Scatter" and len(numeric_cols) >= 3:
395
+ x_col = st.selectbox("X-axis", numeric_cols)
396
+ y_col = st.selectbox("Y-axis", numeric_cols, index=1)
397
+ z_col = st.selectbox("Z-axis", numeric_cols, index=2)
398
+ else:
399
+ x_col = None
400
+ y_col = None
401
+ color_col = st.selectbox("Color by", ["None"] + list(df.columns))
402
+ if color_col == "None":
403
+ color_col = None
404
+ with col2:
405
+ fig = None
406
+ if viz_type == "Scatter" and x_col and y_col:
407
+ fig = px.scatter(df, x=x_col, y=y_col, color=color_col, title=f"{x_col} vs {y_col}")
408
+ elif viz_type == "Histogram" and x_col:
409
+ fig = px.histogram(df, x=x_col, marginal="rug", color=color_col, title=f"Distribution: {x_col}")
410
+ elif viz_type == "Box" and x_col:
411
+ fig = px.box(df, y=x_col, color=color_col, title=f"Box Plot: {x_col}")
412
+ elif viz_type == "Violin" and x_col:
413
+ fig = px.violin(df, y=x_col, box=True, color=color_col, title=f"Violin Plot: {x_col}")
414
+ elif viz_type == "3D Scatter" and 'z_col' in locals():
415
+ fig = px.scatter_3d(df, x=x_col, y=y_col, z=z_col, color=color_col, title="3D Visualization")
416
+ elif viz_type == "Bubble" and 'size_col' in locals():
417
+ fig = px.scatter(df, x=x_col, y=y_col, size=size_col, color=color_col,
418
+ title="Bubble Chart", size_max=60)
419
+ elif viz_type == "Heatmap" and len(numeric_cols) > 1:
420
+ fig = px.imshow(df[numeric_cols].corr(), text_auto=True, color_continuous_scale="Viridis")
421
+ if fig:
422
+ st.plotly_chart(fig, use_container_width=True)
423
+
424
+ with tab3:
425
+ st.header("πŸ€– Automated Machine Learning")
426
+ col1, col2 = st.columns([1, 2])
427
+ with col1:
428
+ target = st.selectbox("🎯 Target Variable", df.columns)
429
+ if st.button("πŸš€ Launch AutoML", type="primary"):
430
+ with st.spinner("🧠 Training AI models..."):
431
+ results = quick_ml(df, target)
432
+ if results:
433
+ st.success("βœ… Model Ready!")
434
+ st.metric("Performance Score", f"{results['score']:.3f}")
435
+ st.caption(f"*{results['task']} Model*")
436
+ for insight in results['insights']:
437
+ st.info(insight)
438
+ summary = generate_executive_summary(df, results)
439
+ st.markdown("**Executive Summary:**")
440
+ st.write(summary)
441
+ with col2:
442
+ if 'results' in locals() and results:
443
+ fig = px.bar(results['features'], x='importance', y='feature', orientation='h',
444
+ title="🎯 Feature Importance Analysis", color='importance',
445
+ color_continuous_scale='Blues')
446
+ st.plotly_chart(fig, use_container_width=True)
447
+
448
+ with tab4:
449
+ st.header("πŸ§ͺ AI Laboratory")
450
+ col1, col2 = st.columns(2)
451
+
452
+ with col1:
453
+ st.subheader("🎨 AI Data Insights")
454
+ user_query = st.text_area(
455
+ "Ask AI about your data:",
456
+ placeholder="e.g., What patterns should I investigate?",
457
+ height=100
458
+ )
459
+
460
+ if st.button("πŸ€– Ask AI"):
461
+ if user_query:
462
+ # Build a rich prompt
463
+ num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
464
+ cat_cols = df.select_dtypes(include=['object']).columns.tolist()
465
+ prompt = f"""
466
+ Dataset snapshot:
467
+ - Shape: {df.shape}
468
+ - Numeric columns: {num_cols[:5]}{'...' if len(num_cols)>5 else ''}
469
+ - Categorical columns: {cat_cols[:5]}{'...' if len(cat_cols)>5 else ''}
470
+ - Missing values: {df.isnull().sum().sum()}
471
+ - First 3 rows as JSON: {json.dumps(df.head(3).to_dict(orient="records"))}
472
+
473
+ User question: {user_query}
474
+
475
+ Give a concise, actionable answer (max 80 words).
476
+ """
477
+
478
+ with st.spinner("Querying Gemini…"):
479
+ answer = safe_gemini(prompt, fallback="πŸ’‘ Tip: check your GEMINI_API_KEY or quota.")
480
+ st.success("AI Response:")
481
+ st.write(answer)
482
+
483
+ with col2:
484
+ st.subheader("πŸ”¬ Anomaly Detection")
485
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
486
+ if len(numeric_cols) > 0:
487
+ anomaly_col = st.selectbox("Select column for anomaly detection", numeric_cols)
488
+ if st.button("πŸ” Detect Anomalies"):
489
+ Q1, Q3 = df[anomaly_col].quantile([0.25, 0.75])
490
+ IQR = Q3 - Q1
491
+ anomalies = df[(df[anomaly_col] < Q1 - 1.5 * IQR) |
492
+ (df[anomaly_col] > Q3 + 1.5 * IQR)]
493
+
494
+ if len(anomalies) > 0:
495
+ st.warning(f"Found {len(anomalies)} anomalies!")
496
+ st.info(anomaly_explanation(df, anomaly_col, anomalies))
497
+ fig = go.Figure()
498
+ fig.add_trace(go.Scatter(
499
+ y=df[anomaly_col], mode='markers',
500
+ name='Normal', marker=dict(color='blue', size=5)))
501
+ fig.add_trace(go.Scatter(
502
+ y=anomalies[anomaly_col], x=anomalies.index,
503
+ mode='markers', name='Anomalies',
504
+ marker=dict(color='red', size=10)))
505
+ fig.update_layout(title=f"Anomalies in {anomaly_col}")
506
+ st.plotly_chart(fig, use_container_width=True)
507
+ else:
508
+ st.success("No significant anomalies detected!")
509
+ else:
510
+ st.markdown("""
511
+ ## πŸš€ Welcome to DataWhisperer Pro
512
+ ### *Powered by Google Gemini AI*
513
+
514
+ ---
515
+
516
+ ### 🌟 Why DataWhisperer Pro?
517
+
518
+ #### πŸ“Š **Intelligent EDA**
519
+ - AI-generated data narratives
520
+ - Pattern recognition with clustering
521
+ - Anomaly detection & explanation
522
+ - 8+ auto-generated visualizations
523
+ - 3D interactive exploration
524
+
525
+ #### πŸ€– **Gemini AI Integration**
526
+ - Natural language data queries
527
+ - Smart feature engineering suggestions
528
+ - Automated insight generation
529
+ - Executive summaries
530
+ - Predictive modeling guidance
531
+
532
+ #### ⚑ **Professional Features**
533
+ - Production-ready visualizations
534
+ - ML model evaluation
535
+ - Real-time AI assistance
536
+ - Export-ready reports
537
+
538
+ #### 🎯 **Built for Data Scientists**
539
+ - Clean, modular architecture
540
+ - Scalable design patterns
541
+ - Industry best practices
542
+ - Comprehensive documentation
543
+
544
+ ---
545
+
546
+ **πŸ‘ˆ Upload your CSV to unlock AI-powered insights!**
547
+ """)
548
+ col1, col2, col3, col4 = st.columns(4)
549
+ col1.metric("Visualizations", "8+", "Auto-generated")
550
+ col2.metric("AI Features", "6", "Gemini-powered")
551
+ col3.metric("ML Models", "2", "AutoML ready")
552
+ col4.metric("Processing", "<3s", "Lightning fast")
553
+ st.markdown("---")
554
+ st.caption("Built with ❀️ using Streamlit, Plotly, Scikit-learn, and Google Gemini AI")
requirements.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core dependencies
2
+ streamlit==1.32.0
3
+ pandas==2.2.0
4
+ numpy==1.26.3
5
+
6
+ # Visualization
7
+ plotly==5.19.0
8
+
9
+ # Machine Learning
10
+ scikit-learn==1.4.0
11
+
12
+ # Google Gemini AI
13
+ google-generativeai==0.3.2
14
+
15
+ # Environment variables
16
+ python-dotenv==1.0.1
17
+
18
+ # Additional utilities
19
+ openpyxl==3.1.2 # For Excel file support
test_gemini.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ import google.generativeai as genai
4
+
5
+ # 1. Load env
6
+ load_dotenv()
7
+ key = os.getenv("GEMINI_API_KEY")
8
+
9
+ # 2. Basic checks
10
+ print("GEMINI_API_KEY found:", bool(key))
11
+ if not key:
12
+ exit("❌ Key missing – fix .env file")
13
+
14
+ # 3. Configure SDK
15
+ genai.configure(api_key=key)
16
+
17
+ # 4. Quick call
18
+ try:
19
+ model = genai.GenerativeModel("gemini-1.5-flash")
20
+ resp = model.generate_content("Say 'OK' if you are alive.", generation_config={"max_output_tokens": 5})
21
+ print("βœ… Gemini OK – response:", resp.text.strip())
22
+ except Exception as e:
23
+ print("❌ Gemini error:", e)