pavanmutha committed on
Commit
94ecd73
·
verified ·
1 Parent(s): 599065f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +184 -186
app.py CHANGED
@@ -23,12 +23,24 @@ from sklearn.feature_selection import SelectKBest, f_classif
23
 
24
  # Authenticate Hugging Face
25
  hf_token = os.getenv("HF_TOKEN")
26
- login(token=hf_token, add_to_git_credential=True)
 
27
 
28
  # Initialize Model
29
  model = HfApiModel("mistralai/Mixtral-8x7B-Instruct-v0.1", token=hf_token)
30
 
31
- # Cache for explanations (last 10 explanations)
 
 
 
 
 
 
 
 
 
 
 
32
  @lru_cache(maxsize=10)
33
  def cached_generate_lime_explanation(insight_text: str, class_names: tuple = ("Negative", "Positive")):
34
  """Generate and cache LIME explanations to improve performance"""
@@ -69,112 +81,96 @@ def cached_generate_lime_explanation(insight_text: str, class_names: tuple = ("N
69
 
70
  def generate_shap_explanation(model, X_train, X_test):
71
  """Generate SHAP explanations for model predictions"""
72
- explainer = shap.TreeExplainer(model)
73
- shap_values = explainer.shap_values(X_test)
74
-
75
- # Save SHAP plots
76
- shap_figures = []
77
- for plot_type in ['summary', 'bar', 'waterfall']:
78
- plt.figure()
79
- if plot_type == 'summary':
80
- shap.summary_plot(shap_values, X_test, plot_size=(10, 8), show=False)
81
- elif plot_type == 'bar':
82
- shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)
83
- elif plot_type == 'waterfall':
84
- shap.plots._waterfall.waterfall_legacy(explainer.expected_value[0],
85
- shap_values[0][0],
86
- feature_names=X_test.columns, show=False)
87
 
88
- fig_path = f'./figures/shap_{plot_type}.png'
89
- plt.savefig(fig_path, bbox_inches='tight')
90
- plt.close()
91
- shap_figures.append(fig_path)
92
-
93
- return shap_figures
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
  def feature_engineering_experiments(X_train, X_test, y_train, y_test):
96
  """Run different feature engineering approaches and compare results"""
97
  results = {}
98
 
99
- # Original features baseline
100
- base_model = RandomForestClassifier(random_state=42)
101
- base_model.fit(X_train, y_train)
102
- y_pred = base_model.predict(X_test)
103
- results['baseline'] = {
104
- 'accuracy': accuracy_score(y_test, y_pred),
105
- 'precision': precision_score(y_test, y_pred, average='weighted'),
106
- 'recall': recall_score(y_test, y_pred, average='weighted'),
107
- 'f1': f1_score(y_test, y_pred, average='weighted')
108
- }
109
-
110
- # Standardized features
111
- scaler = StandardScaler()
112
- X_train_scaled = scaler.fit_transform(X_train)
113
- X_test_scaled = scaler.transform(X_test)
114
-
115
- scaled_model = RandomForestClassifier(random_state=42)
116
- scaled_model.fit(X_train_scaled, y_train)
117
- y_pred = scaled_model.predict(X_test_scaled)
118
- results['scaled'] = {
119
- 'accuracy': accuracy_score(y_test, y_pred),
120
- 'precision': precision_score(y_test, y_pred, average='weighted'),
121
- 'recall': recall_score(y_test, y_pred, average='weighted'),
122
- 'f1': f1_score(y_test, y_pred, average='weighted')
123
- }
124
-
125
- # Polynomial features
126
- poly = PolynomialFeatures(degree=2, interaction_only=True)
127
- X_train_poly = poly.fit_transform(X_train)
128
- X_test_poly = poly.transform(X_test)
129
-
130
- poly_model = RandomForestClassifier(random_state=42)
131
- poly_model.fit(X_train_poly, y_train)
132
- y_pred = poly_model.predict(X_test_poly)
133
- results['polynomial'] = {
134
- 'accuracy': accuracy_score(y_test, y_pred),
135
- 'precision': precision_score(y_test, y_pred, average='weighted'),
136
- 'recall': recall_score(y_test, y_pred, average='weighted'),
137
- 'f1': f1_score(y_test, y_pred, average='weighted')
138
- }
139
-
140
- # Feature selection
141
- selector = SelectKBest(f_classif, k=5)
142
- X_train_selected = selector.fit_transform(X_train, y_train)
143
- X_test_selected = selector.transform(X_test)
144
-
145
- selected_model = RandomForestClassifier(random_state=42)
146
- selected_model.fit(X_train_selected, y_train)
147
- y_pred = selected_model.predict(X_test_selected)
148
- results['selected'] = {
149
- 'accuracy': accuracy_score(y_test, y_pred),
150
- 'precision': precision_score(y_test, y_pred, average='weighted'),
151
- 'recall': recall_score(y_test, y_pred, average='weighted'),
152
- 'f1': f1_score(y_test, y_pred, average='weighted')
153
- }
154
-
155
- return results
156
-
157
- def format_analysis_report(raw_output, visuals):
158
  try:
159
- analysis_dict = raw_output if isinstance(raw_output, dict) else ast.literal_eval(str(raw_output))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
 
161
- report = f"""
162
- <div style="font-family: Arial, sans-serif; padding: 20px; color: #333;">
163
- <h1 style="color: #2B547E; border-bottom: 2px solid #2B547E; padding-bottom: 10px;">📊 Data Analysis Report</h1>
164
- <div style="margin-top: 25px; background: #f8f9fa; padding: 20px; border-radius: 8px;">
165
- <h2 style="color: #2B547E;">🔍 Key Observations</h2>
166
- {format_observations(analysis_dict.get('observations', {}))}
167
- </div>
168
- <div style="margin-top: 30px;">
169
- <h2 style="color: #2B547E;">💡 Insights & Visualizations</h2>
170
- {format_insights(analysis_dict.get('insights', {}), visuals)}
171
- </div>
172
- </div>
173
- """
174
- return report, visuals, list(analysis_dict.get('insights', {})).values()
175
  except Exception as e:
176
- print(f"Error formatting report: {e}")
177
- return raw_output, visuals, []
 
178
 
179
  def analyze_data(csv_file, additional_notes=""):
180
  start_time = time.time()
@@ -185,102 +181,104 @@ def analyze_data(csv_file, additional_notes=""):
185
  shutil.rmtree('./figures')
186
  os.makedirs('./figures', exist_ok=True)
187
 
188
- wandb.login(key=os.environ.get('WANDB_API_KEY'))
189
- run = wandb.init(project="huggingface-data-analysis", config={
190
- "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
191
- "additional_notes": additional_notes,
192
- "source_file": csv_file.name if csv_file else None
193
- })
194
-
195
- # Load and preprocess data
196
- data = pd.read_csv(csv_file.name)
197
- X = data.drop('target', axis=1) # Assuming 'target' is the label column
198
- y = data['target']
199
-
200
- # Split data
201
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
202
-
203
- # Feature engineering experiments
204
- feat_eng_results = feature_engineering_experiments(X_train, X_test, y_train, y_test)
205
- wandb.log({"feature_engineering": feat_eng_results})
206
-
207
- # Train final model with best approach (using baseline here for demo)
208
- final_model = RandomForestClassifier(random_state=42)
209
- final_model.fit(X_train, y_train)
210
-
211
- # Generate SHAP explanations
212
- shap_figs = generate_shap_explanation(final_model, X_train, X_test)
213
-
214
- agent = CodeAgent(tools=[], model=model, additional_authorized_imports=["numpy", "pandas", "matplotlib.pyplot", "seaborn"])
215
- analysis_result = agent.run("""
216
- You are an expert data analyst. Perform comprehensive analysis including:
217
- 1. Basic statistics and data quality checks
218
- 2. Feature engineering experiment results
219
- 3. 3 insightful analytical questions about relationships in the data
220
- 4. Visualization of key patterns and correlations
221
- 5. Actionable real-world insights derived from findings
222
- Generate publication-quality visualizations and save to './figures/'
223
- """, additional_args={"additional_notes": additional_notes, "source_file": csv_file})
224
-
225
- execution_time = time.time() - start_time
226
- final_memory = process.memory_info().rss / 1024 ** 2
227
- memory_usage = final_memory - initial_memory
228
- wandb.log({"execution_time_sec": execution_time, "memory_usage_mb": memory_usage})
229
-
230
- visuals = [os.path.join('./figures', f) for f in os.listdir('./figures') if f.endswith(('.png', '.jpg', '.jpeg'))]
231
- visuals.extend(shap_figs) # Add SHAP visualizations
232
-
233
- for viz in visuals:
234
- wandb.log({os.path.basename(viz): wandb.Image(viz)})
235
 
236
- run.finish()
237
- return format_analysis_report(analysis_result, visuals)
238
-
239
- def objective(trial, X_train, y_train, X_test, y_test):
240
- """Objective function for hyperparameter optimization"""
241
- params = {
242
- 'n_estimators': trial.suggest_int('n_estimators', 50, 500),
243
- 'max_depth': trial.suggest_int('max_depth', 3, 20),
244
- 'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
245
- 'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
246
- 'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
247
- 'bootstrap': trial.suggest_categorical('bootstrap', [True, False])
248
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
 
250
- model = RandomForestClassifier(**params, random_state=42)
251
- model.fit(X_train, y_train)
252
- y_pred = model.predict(X_test)
253
- return f1_score(y_test, y_pred, average='weighted')
254
 
255
  def tune_hyperparameters(csv_file, n_trials: int):
256
  """Run hyperparameter optimization with Optuna"""
257
- data = pd.read_csv(csv_file.name)
258
- X = data.drop('target', axis=1)
259
- y = data['target']
260
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
261
-
262
- study = optuna.create_study(direction="maximize")
263
- study.optimize(lambda trial: objective(trial, X_train, y_train, X_test, y_test), n_trials=n_trials)
264
-
265
- # Train final model with best params
266
- best_model = RandomForestClassifier(**study.best_params, random_state=42)
267
- best_model.fit(X_train, y_train)
268
- y_pred = best_model.predict(X_test)
269
-
270
- metrics = {
271
- 'accuracy': accuracy_score(y_test, y_pred),
272
- 'precision': precision_score(y_test, y_pred, average='weighted'),
273
- 'recall': recall_score(y_test, y_pred, average='weighted'),
274
- 'f1': f1_score(y_test, y_pred, average='weighted')
275
- }
276
-
277
- return f"Best Hyperparameters: {study.best_params}\n\nValidation Metrics:\n{metrics}"
 
 
 
 
278
 
279
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
280
  gr.Markdown("## 📊 AI Data Analysis Agent with Explainability")
281
 
282
  insights_store = gr.State([])
283
- data_store = gr.State(None) # Store loaded data
284
 
285
  with gr.Row():
286
  with gr.Column():
@@ -339,4 +337,4 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
339
  outputs=[optuna_output]
340
  )
341
 
342
- demo.launch(debug=True)
 
23
 
24
  # Authenticate Hugging Face
25
  hf_token = os.getenv("HF_TOKEN")
26
+ if hf_token:
27
+ login(token=hf_token, add_to_git_credential=True)
28
 
29
  # Initialize Model
30
  model = HfApiModel("mistralai/Mixtral-8x7B-Instruct-v0.1", token=hf_token)
31
 
32
def detect_target_column(df):
    """Return the most likely target (label) column name of *df*.

    Checks a list of conventional target-column names — first by exact
    match (preserving the original behavior), then case-insensitively so
    columns like 'Target' or 'LABEL' are also recognized — and falls back
    to the last column when nothing matches.

    Args:
        df: pandas DataFrame whose columns are searched.

    Returns:
        The name of the detected target column (always an element of
        ``df.columns``, with its original spelling).
    """
    # Conventional names for the label column, in priority order.
    possible_targets = ['target', 'label', 'class', 'y', 'outcome', 'result']

    # Pass 1: exact matches take precedence (backward compatible).
    for col in possible_targets:
        if col in df.columns:
            return col

    # Pass 2: case-insensitive lookup, returning the original spelling.
    # NOTE: if two columns differ only by case, the later one wins here.
    lowered = {str(c).lower(): c for c in df.columns}
    for col in possible_targets:
        if col in lowered:
            return lowered[col]

    # Fall back to the common convention of the label being last.
    return df.columns[-1]
43
+
44
  @lru_cache(maxsize=10)
45
  def cached_generate_lime_explanation(insight_text: str, class_names: tuple = ("Negative", "Positive")):
46
  """Generate and cache LIME explanations to improve performance"""
 
81
 
82
def generate_shap_explanation(model, X_train, X_test):
    """Generate SHAP explanations for model predictions.

    Builds a ``shap.TreeExplainer`` for *model*, computes SHAP values on
    ``X_test`` and saves a 'summary' and a 'bar' plot under ``./figures/``.

    Args:
        model: fitted tree-based estimator accepted by ``shap.TreeExplainer``.
        X_train: training features (not used here; kept for interface parity).
        X_test: feature matrix whose predictions are explained.

    Returns:
        List of file paths of the saved SHAP figures, or an empty list if
        anything goes wrong (SHAP failures must not break the analysis).
    """
    try:
        tree_explainer = shap.TreeExplainer(model)
        values = tree_explainer.shap_values(X_test)

        saved_paths = []
        # Render each requested plot type and persist it as a PNG.
        for kind in ('summary', 'bar'):
            plt.figure()
            if kind == 'summary':
                shap.summary_plot(values, X_test, plot_size=(10, 8), show=False)
            else:  # 'bar'
                shap.summary_plot(values, X_test, plot_type="bar", show=False)

            out_path = f'./figures/shap_{kind}.png'
            plt.savefig(out_path, bbox_inches='tight')
            plt.close()
            saved_paths.append(out_path)

        return saved_paths
    except Exception as e:
        # Best-effort: log and degrade gracefully instead of raising.
        print(f"Error generating SHAP explanation: {e}")
        return []
106
 
107
def feature_engineering_experiments(X_train, X_test, y_train, y_test):
    """Run different feature engineering approaches and compare results.

    Trains a fresh ``RandomForestClassifier(random_state=42)`` on four
    variants of the feature set — baseline, standardized, polynomial
    interactions (only when there are fewer than 10 features), and
    SelectKBest univariate selection — recording weighted accuracy,
    precision, recall and f1 for each.

    Args:
        X_train, X_test: train/test feature matrices (DataFrame or array).
        y_train, y_test: corresponding label vectors.

    Returns:
        Dict mapping approach name ('baseline', 'scaled', 'polynomial',
        'selected') to its metrics dict. Approaches that were skipped or
        that failed partway are simply absent.
    """

    def _fit_and_score(X_tr, X_te):
        """Fit a fresh RF on (X_tr, y_train); return weighted metrics on X_te."""
        clf = RandomForestClassifier(random_state=42)
        clf.fit(X_tr, y_train)
        y_pred = clf.predict(X_te)
        return {
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred, average='weighted'),
            'recall': recall_score(y_test, y_pred, average='weighted'),
            'f1': f1_score(y_test, y_pred, average='weighted')
        }

    results = {}
    try:
        # Original features baseline
        results['baseline'] = _fit_and_score(X_train, X_test)

        # Standardized features
        scaler = StandardScaler()
        results['scaled'] = _fit_and_score(scaler.fit_transform(X_train),
                                           scaler.transform(X_test))

        # Polynomial interaction features — only when the feature count is
        # small, since the interaction expansion grows quadratically.
        if X_train.shape[1] < 10:
            poly = PolynomialFeatures(degree=2, interaction_only=True)
            results['polynomial'] = _fit_and_score(poly.fit_transform(X_train),
                                                   poly.transform(X_test))

        # Univariate feature selection (at most 5 features, capped by width)
        k = min(5, X_train.shape[1])
        selector = SelectKBest(f_classif, k=k)
        results['selected'] = _fit_and_score(
            selector.fit_transform(X_train, y_train),
            selector.transform(X_test))
    except Exception as e:
        # Best-effort: return whatever experiments completed before failure.
        print(f"Error in feature engineering experiments: {e}")

    return results
174
 
175
  def analyze_data(csv_file, additional_notes=""):
176
  start_time = time.time()
 
181
  shutil.rmtree('./figures')
182
  os.makedirs('./figures', exist_ok=True)
183
 
184
+ try:
185
+ wandb.login(key=os.environ.get('WANDB_API_KEY'))
186
+ run = wandb.init(project="huggingface-data-analysis", config={
187
+ "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
188
+ "additional_notes": additional_notes,
189
+ "source_file": csv_file.name if csv_file else None
190
+ })
191
+ except:
192
+ run = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
 
194
+ try:
195
+ # Load and preprocess data
196
+ data = pd.read_csv(csv_file.name)
197
+ target_col = detect_target_column(data)
198
+ X = data.drop(target_col, axis=1)
199
+ y = data[target_col]
200
+
201
+ # Split data
202
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
203
+
204
+ # Feature engineering experiments
205
+ feat_eng_results = feature_engineering_experiments(X_train, X_test, y_train, y_test)
206
+ if run:
207
+ wandb.log({"feature_engineering": feat_eng_results})
208
+
209
+ # Train final model with best approach (using baseline here for demo)
210
+ final_model = RandomForestClassifier(random_state=42)
211
+ final_model.fit(X_train, y_train)
212
+
213
+ # Generate SHAP explanations
214
+ shap_figs = generate_shap_explanation(final_model, X_train, X_test)
215
+
216
+ agent = CodeAgent(tools=[], model=model, additional_authorized_imports=["numpy", "pandas", "matplotlib.pyplot", "seaborn"])
217
+ analysis_result = agent.run(f"""
218
+ You are an expert data analyst. Perform comprehensive analysis including:
219
+ 1. Basic statistics and data quality checks
220
+ 2. Feature engineering experiment results: {feat_eng_results}
221
+ 3. Target column used: {target_col}
222
+ 4. 3 insightful analytical questions about relationships in the data
223
+ 5. Visualization of key patterns and correlations
224
+ 6. Actionable real-world insights derived from findings
225
+ Generate publication-quality visualizations and save to './figures/'
226
+ """, additional_args={"additional_notes": additional_notes, "source_file": csv_file})
227
+
228
+ execution_time = time.time() - start_time
229
+ final_memory = process.memory_info().rss / 1024 ** 2
230
+ memory_usage = final_memory - initial_memory
231
+ if run:
232
+ wandb.log({"execution_time_sec": execution_time, "memory_usage_mb": memory_usage})
233
+
234
+ visuals = [os.path.join('./figures', f) for f in os.listdir('./figures') if f.endswith(('.png', '.jpg', '.jpeg'))]
235
+ visuals.extend(shap_figs) # Add SHAP visualizations
236
+
237
+ if run:
238
+ for viz in visuals:
239
+ wandb.log({os.path.basename(viz): wandb.Image(viz)})
240
+
241
+ if run:
242
+ run.finish()
243
+ return format_analysis_report(analysis_result, visuals)
244
 
245
+ except Exception as e:
246
+ if run:
247
+ run.finish()
248
+ return f"Error analyzing data: {str(e)}", [], []
249
 
250
def tune_hyperparameters(csv_file, n_trials: int):
    """Run hyperparameter optimization with Optuna"""
    try:
        # Load the dataset and split off the auto-detected label column.
        frame = pd.read_csv(csv_file.name)
        label_col = detect_target_column(frame)
        features = frame.drop(label_col, axis=1)
        labels = frame[label_col]
        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, test_size=0.2, random_state=42)

        # Maximize the score returned by `objective` over n_trials trials.
        study = optuna.create_study(direction="maximize")
        study.optimize(
            lambda trial: objective(trial, X_train, y_train, X_test, y_test),
            n_trials=n_trials)

        # Refit with the best parameter set and report held-out metrics.
        best_model = RandomForestClassifier(**study.best_params, random_state=42)
        best_model.fit(X_train, y_train)
        predictions = best_model.predict(X_test)

        metrics = {
            'accuracy': accuracy_score(y_test, predictions),
            'precision': precision_score(y_test, predictions, average='weighted'),
            'recall': recall_score(y_test, predictions, average='weighted'),
            'f1': f1_score(y_test, predictions, average='weighted')
        }

        return f"Best Hyperparameters: {study.best_params}\n\nValidation Metrics:\n{metrics}"
    except Exception as e:
        # Surface the failure to the Gradio UI as a plain string.
        return f"Error tuning hyperparameters: {str(e)}"
277
 
278
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
279
  gr.Markdown("## 📊 AI Data Analysis Agent with Explainability")
280
 
281
  insights_store = gr.State([])
 
282
 
283
  with gr.Row():
284
  with gr.Column():
 
337
  outputs=[optuna_output]
338
  )
339
 
340
+ demo.launch(debug=True, share=True)