salihfurkaan commited on
Commit
e68d049
·
1 Parent(s): 90ab796

Add HF Datasets support and dynamic UI improvements

Browse files
__pycache__/app.cpython-313.pyc CHANGED
Binary files a/__pycache__/app.cpython-313.pyc and b/__pycache__/app.cpython-313.pyc differ
 
app.py CHANGED
@@ -2,7 +2,7 @@
2
  import gradio as gr
3
  import pandas as pd
4
  import os
5
- from src.ingestion import load_file
6
  from src.profiling import profile_data, get_overview_text
7
  from src.cleaning import clean_data
8
  from src.anomalies import detect_anomalies
@@ -10,42 +10,45 @@ from src.visualization import generate_charts
10
  from src.llm import get_insights, get_followup_questions, ask_llm, get_ml_recommendations, analyze_text_content
11
 
12
  # Updated analyze_dataset to accept api_token
13
- def analyze_dataset(file_obj, api_token):
14
- if file_obj is None:
 
 
 
 
 
15
  return (
16
- "## Please upload a file to begin.",
17
  pd.DataFrame(),
18
- "",
19
- None,
20
- "",
21
  pd.DataFrame(),
22
  "",
23
- "", # Text Analysis
24
  None # For download file
25
  )
26
-
27
- # 1. Ingestion
28
- df, error, load_log = load_file(file_obj)
29
  if error:
30
- return f"## Error: {error}", pd.DataFrame(), "", None, "", pd.DataFrame(), "", "", None
31
-
32
  # 2. Profiling & Cleaning
33
  df_clean, cleaning_log = clean_data(df)
34
  profile = profile_data(df_clean)
35
  overview_text = get_overview_text(profile)
36
-
37
  # 3. Anomalies
38
  anomalies_df, anomaly_summary = detect_anomalies(df_clean)
39
-
40
  # 4. Visualization
41
  chart_figure = generate_charts(df_clean, profile)
42
-
43
  # 5. LLM Insights & Questions
44
  insights = get_insights(overview_text, anomaly_summary, api_token)
45
  ml_recommendations = get_ml_recommendations(overview_text, api_token)
46
-
47
  # 6. Text Analysis (New)
48
- text_analysis = ""
49
  # Check for 'Content' column from .txt ingestion or 'Review'/'Text' columns in CSV
50
  text_cols = [col for col in df_clean.columns if col.lower() in ['content', 'text', 'review', 'comments']]
51
  if text_cols:
@@ -54,18 +57,19 @@ def analyze_dataset(file_obj, api_token):
54
  # Get up to 50 lines/samples
55
  samples = df_clean[target_col].dropna().astype(str).tolist()
56
  if samples:
57
- text_analysis = analyze_text_content(samples, api_token)
58
-
 
59
  # Format Outputs
60
  overview_output = f"{overview_text}\n\n"
61
  if load_log:
62
  overview_output += f"{load_log}\n\n"
63
  overview_output += "**Data Cleaning Log:**\n" + "\n".join([f"- {item}" for item in cleaning_log])
64
-
65
  # Save cleaned data for download
66
  output_path = "cleaned_data.csv"
67
  df_clean.to_csv(output_path, index=False)
68
-
69
  return (
70
  overview_output, # Dataset Overview (Markdown)
71
  df_clean.head(), # Dataset Overview (DataFrame)
@@ -74,7 +78,7 @@ def analyze_dataset(file_obj, api_token):
74
  f"### Anomaly Detection Report\n{anomaly_summary}", # Anomalies Markdown
75
  anomalies_df, # Anomalies DataFrame
76
  ml_recommendations, # ML Recommendations
77
- text_analysis, # Text Analysis (New)
78
  output_path # Download File Path
79
  )
80
 
@@ -90,7 +94,7 @@ def load_example():
90
  df = pd.DataFrame(dummy_data)
91
  # Add some anomalies
92
  df.loc[6, "Salary"] = 1200000 # outlier
93
-
94
  df.to_csv("example_dataset.csv", index=False)
95
  return "example_dataset.csv"
96
 
@@ -100,8 +104,8 @@ def chat_response(message, history, overview_text, api_token):
100
  return ask_llm(message, history, overview_text, api_token)
101
 
102
  # Updated process function wrapper to match inputs/outputs
103
- def process_file_wrapper(file_obj, api_token):
104
- results = analyze_dataset(file_obj, api_token)
105
  return results + (results[0],) # Append overview_md for the state
106
 
107
  # --- Custom Styling & Theme ---
@@ -133,10 +137,22 @@ h1 {
133
  text-align: center;
134
  color: #94a3b8;
135
  font-size: 1.2rem;
136
- margin-bottom: 2.5rem;
137
  font-weight: 300;
138
  }
139
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  /* Sidebar Styling */
141
  .sidebar-content {
142
  background: linear-gradient(145deg, #1e293b, #0f172a);
@@ -211,27 +227,42 @@ theme = gr.themes.Soft(
211
  with gr.Blocks(title="Auto Data Analyst", theme=theme, css=custom_css) as demo:
212
  gr.Markdown("# ⚡ Auto Data Analyst")
213
  gr.Markdown("<div class='subtitle'>Instant AI Analysis • Professional Insights</div>")
214
-
 
 
 
 
 
 
 
 
215
  # State to hold the overview text for the chatbot
216
  overview_state = gr.State()
217
-
218
  with gr.Row():
219
  # Sidebar
220
  with gr.Column(scale=1, elem_classes="sidebar-content"):
221
  gr.Markdown("### 📂 Data Source")
222
- file_upload = gr.File(label="Upload Dataset", file_types=[".csv", ".xlsx", ".json", ".parquet", ".txt", ".zip"])
223
- example_btn = gr.Button("🎲 Load Sample Data", variant="secondary")
224
-
 
 
 
 
 
 
 
225
  gr.Markdown("---")
226
  gr.Markdown("### 🔐 Authentication")
227
  api_token_input = gr.Textbox(
228
- label="Hugging Face Token (Optional)",
229
  placeholder="hf_...",
230
  type="password",
231
  info="Paste your token for higher rate limits."
232
  )
233
  gr.Markdown("<small style='color: #64748b;'>Get a free token in your [HF Settings](https://huggingface.co/settings/tokens).</small>")
234
-
235
  # Main Content
236
  with gr.Column(scale=4):
237
  with gr.Tabs():
@@ -240,50 +271,68 @@ with gr.Blocks(title="Auto Data Analyst", theme=theme, css=custom_css) as demo:
240
  # Removed height from Dataframe to avoid Gradio error
241
  dataframe_view = gr.Dataframe(interactive=False, label="Data Preview")
242
  download_btn = gr.DownloadButton("⬇️ Download Cleaned CSV", label="Download CSV", variant="primary")
243
-
244
  with gr.TabItem("💡 Insights"):
245
  insights_md = gr.Markdown("AI Insights will appear here...")
246
-
247
  with gr.TabItem("📈 Visuals"):
248
  charts_plot = gr.Plot(label="Interactive Dashboard")
249
-
250
  with gr.TabItem("⚠️ Anomalies"):
251
  anomalies_md = gr.Markdown("Anomaly report...")
252
  anomalies_df_view = gr.Dataframe(interactive=False, label="Detected Anomalies")
253
-
254
  with gr.TabItem("🧠 ML Models"):
255
  ml_md = gr.Markdown("ML Recommendations will appear here.")
256
-
257
- with gr.TabItem("📝 Text Analysis"):
258
  text_analysis_md = gr.Markdown("Upload a .txt file or dataset with a 'Content/Review' column to see text analysis.")
259
-
260
  with gr.TabItem("💬 Assistant"):
261
  chatbot = gr.ChatInterface(
262
  fn=chat_response,
263
  additional_inputs=[overview_state, api_token_input]
264
  )
265
-
266
- # Event wiring
267
  file_upload.change(
268
- fn=process_file_wrapper,
269
- inputs=[file_upload, api_token_input],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
  outputs=[
271
- overview_md,
272
- dataframe_view,
273
- insights_md,
274
- charts_plot,
275
- anomalies_md,
276
- anomalies_df_view,
277
  ml_md,
278
- text_analysis_md,
279
  download_btn,
280
  overview_state
281
  ]
282
  )
283
-
284
  example_btn.click(
285
  fn=load_example,
286
- outputs=[file_upload]
287
  )
288
 
289
  if __name__ == "__main__":
 
2
  import gradio as gr
3
  import pandas as pd
4
  import os
5
+ from src.ingestion import load_file, load_hf_dataset
6
  from src.profiling import profile_data, get_overview_text
7
  from src.cleaning import clean_data
8
  from src.anomalies import detect_anomalies
 
10
  from src.llm import get_insights, get_followup_questions, ask_llm, get_ml_recommendations, analyze_text_content
11
 
12
  # Updated analyze_dataset to accept api_token
13
+ def analyze_dataset(file_obj, hf_dataset_name, api_token):
14
+ # Handle HF Dataset loading if name is provided
15
+ if hf_dataset_name:
16
+ df, error, load_log = load_hf_dataset(hf_dataset_name, api_token=api_token)
17
+ elif file_obj:
18
+ df, error, load_log = load_file(file_obj)
19
+ else:
20
  return (
21
+ "## Please upload a file or enter a HF Dataset name to begin.",
22
  pd.DataFrame(),
23
+ "",
24
+ None,
25
+ "",
26
  pd.DataFrame(),
27
  "",
28
+ gr.update(visible=False, value=""), # Text Analysis hidden
29
  None # For download file
30
  )
31
+
 
 
32
  if error:
33
+ return f"## Error: {error}", pd.DataFrame(), "", None, "", pd.DataFrame(), "", gr.update(visible=False, value=""), None
34
+
35
  # 2. Profiling & Cleaning
36
  df_clean, cleaning_log = clean_data(df)
37
  profile = profile_data(df_clean)
38
  overview_text = get_overview_text(profile)
39
+
40
  # 3. Anomalies
41
  anomalies_df, anomaly_summary = detect_anomalies(df_clean)
42
+
43
  # 4. Visualization
44
  chart_figure = generate_charts(df_clean, profile)
45
+
46
  # 5. LLM Insights & Questions
47
  insights = get_insights(overview_text, anomaly_summary, api_token)
48
  ml_recommendations = get_ml_recommendations(overview_text, api_token)
49
+
50
  # 6. Text Analysis (New)
51
+ text_analysis_output = gr.update(visible=False, value="")
52
  # Check for 'Content' column from .txt ingestion or 'Review'/'Text' columns in CSV
53
  text_cols = [col for col in df_clean.columns if col.lower() in ['content', 'text', 'review', 'comments']]
54
  if text_cols:
 
57
  # Get up to 50 lines/samples
58
  samples = df_clean[target_col].dropna().astype(str).tolist()
59
  if samples:
60
+ analysis_result = analyze_text_content(samples, api_token)
61
+ text_analysis_output = gr.update(visible=True, value=analysis_result)
62
+
63
  # Format Outputs
64
  overview_output = f"{overview_text}\n\n"
65
  if load_log:
66
  overview_output += f"{load_log}\n\n"
67
  overview_output += "**Data Cleaning Log:**\n" + "\n".join([f"- {item}" for item in cleaning_log])
68
+
69
  # Save cleaned data for download
70
  output_path = "cleaned_data.csv"
71
  df_clean.to_csv(output_path, index=False)
72
+
73
  return (
74
  overview_output, # Dataset Overview (Markdown)
75
  df_clean.head(), # Dataset Overview (DataFrame)
 
78
  f"### Anomaly Detection Report\n{anomaly_summary}", # Anomalies Markdown
79
  anomalies_df, # Anomalies DataFrame
80
  ml_recommendations, # ML Recommendations
81
+ text_analysis_output, # Text Analysis (Dynamic)
82
  output_path # Download File Path
83
  )
84
 
 
94
  df = pd.DataFrame(dummy_data)
95
  # Add some anomalies
96
  df.loc[6, "Salary"] = 1200000 # outlier
97
+
98
  df.to_csv("example_dataset.csv", index=False)
99
  return "example_dataset.csv"
100
 
 
104
  return ask_llm(message, history, overview_text, api_token)
105
 
106
  # Updated process function wrapper to match inputs/outputs
107
def process_data_wrapper(file_obj, hf_dataset, api_token):
    """Run the full analysis and mirror the overview text into the chat state.

    Returns the analyze_dataset output tuple with its first element (the
    overview markdown) appended once more, so the extra `overview_state`
    output component receives the same text the Overview tab shows.
    """
    outputs = analyze_dataset(file_obj, hf_dataset, api_token)
    overview_for_state = outputs[0]
    return (*outputs, overview_for_state)
110
 
111
  # --- Custom Styling & Theme ---
 
137
  text-align: center;
138
  color: #94a3b8;
139
  font-size: 1.2rem;
140
+ margin-bottom: 1.5rem;
141
  font-weight: 300;
142
  }
143
 
144
+ .feature-highlights {
145
+ text-align: center;
146
+ color: #cbd5e1;
147
+ font-size: 0.95rem;
148
+ margin-bottom: 2rem;
149
+ background: rgba(30, 41, 59, 0.5);
150
+ padding: 10px;
151
+ border-radius: 8px;
152
+ border: 1px solid #334155;
153
+ display: inline-block;
154
+ }
155
+
156
  /* Sidebar Styling */
157
  .sidebar-content {
158
  background: linear-gradient(145deg, #1e293b, #0f172a);
 
227
  with gr.Blocks(title="Auto Data Analyst", theme=theme, css=custom_css) as demo:
228
  gr.Markdown("# ⚡ Auto Data Analyst")
229
  gr.Markdown("<div class='subtitle'>Instant AI Analysis • Professional Insights</div>")
230
+
231
+ # Feature Highlights / Advertisement
232
+ with gr.Row(elem_classes="group"):
233
+ gr.Markdown(
234
+ "<div class='feature-highlights' style='width: 100%;'>"
235
+ "✨ <b>Supports:</b> CSV, Excel, JSON, Parquet, Zip (Smart Selection) & Hugging Face Datasets! 🚀<br>"
236
+ "</div>"
237
+ )
238
+
239
  # State to hold the overview text for the chatbot
240
  overview_state = gr.State()
241
+
242
  with gr.Row():
243
  # Sidebar
244
  with gr.Column(scale=1, elem_classes="sidebar-content"):
245
  gr.Markdown("### 📂 Data Source")
246
+
247
+ with gr.Tabs():
248
+ with gr.TabItem("Upload"):
249
+ file_upload = gr.File(label="Upload File", file_types=[".csv", ".xlsx", ".json", ".parquet", ".txt", ".zip"])
250
+ example_btn = gr.Button("🎲 Load Sample Data", variant="secondary")
251
+
252
+ with gr.TabItem("HF Dataset"):
253
+ hf_input = gr.Textbox(label="Dataset Name", placeholder="e.g. titanic, dair-ai/emotion", info="Loads the 'train' split.")
254
+ hf_load_btn = gr.Button("⬇️ Load Dataset", variant="primary")
255
+
256
  gr.Markdown("---")
257
  gr.Markdown("### 🔐 Authentication")
258
  api_token_input = gr.Textbox(
259
+ label="Hugging Face Token (Optional)",
260
  placeholder="hf_...",
261
  type="password",
262
  info="Paste your token for higher rate limits."
263
  )
264
  gr.Markdown("<small style='color: #64748b;'>Get a free token in your [HF Settings](https://huggingface.co/settings/tokens).</small>")
265
+
266
  # Main Content
267
  with gr.Column(scale=4):
268
  with gr.Tabs():
 
271
  # Removed height from Dataframe to avoid Gradio error
272
  dataframe_view = gr.Dataframe(interactive=False, label="Data Preview")
273
  download_btn = gr.DownloadButton("⬇️ Download Cleaned CSV", label="Download CSV", variant="primary")
274
+
275
  with gr.TabItem("💡 Insights"):
276
  insights_md = gr.Markdown("AI Insights will appear here...")
277
+
278
  with gr.TabItem("📈 Visuals"):
279
  charts_plot = gr.Plot(label="Interactive Dashboard")
280
+
281
  with gr.TabItem("⚠️ Anomalies"):
282
  anomalies_md = gr.Markdown("Anomaly report...")
283
  anomalies_df_view = gr.Dataframe(interactive=False, label="Detected Anomalies")
284
+
285
  with gr.TabItem("🧠 ML Models"):
286
  ml_md = gr.Markdown("ML Recommendations will appear here.")
287
+
288
+ with gr.TabItem("📝 Text Analysis", visible=False) as text_tab:
289
  text_analysis_md = gr.Markdown("Upload a .txt file or dataset with a 'Content/Review' column to see text analysis.")
290
+
291
  with gr.TabItem("💬 Assistant"):
292
  chatbot = gr.ChatInterface(
293
  fn=chat_response,
294
  additional_inputs=[overview_state, api_token_input]
295
  )
296
+
297
+ # Event wiring - File Upload
298
  file_upload.change(
299
+ fn=process_data_wrapper,
300
+ inputs=[file_upload, gr.State(None), api_token_input],
301
+ outputs=[
302
+ overview_md,
303
+ dataframe_view,
304
+ insights_md,
305
+ charts_plot,
306
+ anomalies_md,
307
+ anomalies_df_view,
308
+ ml_md,
309
+ text_tab, # Target the TabItem for visibility
310
+ download_btn,
311
+ overview_state
312
+ ]
313
+ )
314
+
315
+ # Also wire HF Load Button
316
+ hf_load_btn.click(
317
+ fn=process_data_wrapper,
318
+ inputs=[gr.State(None), hf_input, api_token_input],
319
  outputs=[
320
+ overview_md,
321
+ dataframe_view,
322
+ insights_md,
323
+ charts_plot,
324
+ anomalies_md,
325
+ anomalies_df_view,
326
  ml_md,
327
+ text_tab, # Target the TabItem for visibility
328
  download_btn,
329
  overview_state
330
  ]
331
  )
332
+
333
  example_btn.click(
334
  fn=load_example,
335
+ outputs=[file_upload]
336
  )
337
 
338
  if __name__ == "__main__":
requirements.txt CHANGED
@@ -4,5 +4,6 @@ scikit-learn
4
  plotly
5
  gradio
6
  huggingface_hub
 
7
  openpyxl
8
  pyarrow
 
4
  plotly
5
  gradio
6
  huggingface_hub
7
+ datasets
8
  openpyxl
9
  pyarrow
src/__pycache__/ingestion.cpython-313.pyc CHANGED
Binary files a/src/__pycache__/ingestion.cpython-313.pyc and b/src/__pycache__/ingestion.cpython-313.pyc differ
 
src/ingestion.py CHANGED
@@ -122,3 +122,26 @@ def load_file(file_obj):
122
 
123
  except Exception as e:
124
  return None, f"Error loading file: {str(e)}", None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
  except Exception as e:
124
  return None, f"Error loading file: {str(e)}", None
125
+
126
def load_hf_dataset(dataset_name, split='train', api_token=None, trust_remote_code=True):
    """
    Load a dataset from the Hugging Face Hub and return it as a pandas DataFrame.

    Args:
        dataset_name: Hub dataset id, e.g. "titanic" or "dair-ai/emotion".
            If a config is needed, the user can specify "dataset_name/config".
        split: Which split to load (defaults to 'train').
        api_token: Optional HF token, forwarded for gated/private datasets.
        trust_remote_code: Forwarded to ``datasets.load_dataset``.
            SECURITY: True (the default, kept for backward compatibility)
            lets script-based datasets execute arbitrary repo code locally —
            pass False when the dataset name comes from untrusted input.

    Returns:
        (df, error, log) — exactly one of ``df`` / ``error`` is non-None;
        ``log`` is a human-readable load message on success, else None.
    """
    # Guard against empty / whitespace-only names before touching the Hub.
    if not dataset_name or not str(dataset_name).strip():
        return None, "No dataset name provided.", None
    dataset_name = str(dataset_name).strip()

    try:
        # Imported lazily so the app still starts when `datasets` is absent;
        # a missing package surfaces as an error string like any load failure.
        from datasets import load_dataset

        ds = load_dataset(
            dataset_name,
            split=split,
            token=api_token,
            trust_remote_code=trust_remote_code,
        )

        # Convert to pandas for the downstream profiling/cleaning pipeline.
        df = ds.to_pandas()

        if df.empty:
            return None, f"Dataset '{dataset_name}' (split='{split}') is empty.", None

        return df, None, f"Loaded Hugging Face Dataset: `{dataset_name}` (Split: {split})"

    except Exception as e:
        # Best-effort API: report any failure (missing lib, bad name, auth,
        # network) as an error string instead of raising into the UI.
        return None, f"Error loading HF Dataset '{dataset_name}': {str(e)}", None
verify_pipeline_mock.py CHANGED
@@ -23,16 +23,21 @@ with patch('src.llm.get_insights', return_value="Mocked Insights") as mock_insig
23
  example_path = load_example()
24
  print(f"Example dataset created at: {example_path}")
25
 
26
- print("Running pipeline with MOCKED LLM...")
27
  mock_file = MockFile(example_path)
28
-
 
29
  try:
30
- results = analyze_dataset(mock_file, api_token="test")
 
31
 
32
  # Unpack results to verify types (updated for new return signature)
33
- # (overview_output, df_head, insights, chart, anomaly_md, anomalies_df, ml_recs, text_analysis, download_path)
34
- # NOTE: load_file now returns 3 values, but analyze_dataset still returns 9. The log is inside overview_output.
35
- overview_md, overview_df, insights, chart, anomalies_md, anomalies_df, ml_recs, text_analysis, download_path = results
 
 
 
 
36
 
37
  print("Pipeline finished successfully (Mocked LLM).")
38
  print("✅ Visualization: Charts generated.")
@@ -43,18 +48,23 @@ with patch('src.llm.get_insights', return_value="Mocked Insights") as mock_insig
43
 
44
  # 6. Text Analysis
45
  print("Testing Text Analysis (Mock)...")
46
- if text_analysis:
47
- print(f"✅ Text Analysis Result: {text_analysis[:50]}...")
48
  else:
49
  print("ℹ️ No Text Analysis generated (Expected for numeric example).")
50
 
51
  print("\n🎉 Pipeline verification passed!")
52
  print(f"Overview MD Length: {len(overview_md)}")
53
  print(f"Overview DF Shape: {overview_df.shape if hasattr(overview_df, 'shape') else 'None'}")
 
54
  print(f"Chart Object: {type(chart)}")
55
  print(f"Anomalies MD Length: {len(anomalies_md)}")
56
  print(f"Anomalies DF Shape: {anomalies_df.shape if hasattr(anomalies_df, 'shape') else 'None'}")
57
 
 
 
 
 
58
  except Exception as e:
59
  print(f"Pipeline Failed: {e}")
60
  import traceback
 
23
  example_path = load_example()
24
  print(f"Example dataset created at: {example_path}")
25
 
 
26
  mock_file = MockFile(example_path)
27
+ print(f"Running pipeline with MOCKED LLM...{os.path.basename(mock_file.name)}")
28
+
29
  try:
30
+ # analyze_dataset signature changed: (file_obj, hf_dataset_name, api_token)
31
+ results = analyze_dataset(mock_file, None, api_token="test")
32
 
33
  # Unpack results to verify types (updated for new return signature)
34
+ # (overview_output, df_head, insights, chart, anomaly_md, anomalies_df, ml_recs, text_analysis_output, download_path)
35
+ # NOTE: text_analysis_output is not a plain string here — analyze_dataset
36
+ # returns a gr.update(visible=..., value=...) payload for that output slot,
37
+ # so treat it as an update dict when inspecting it below.
39
+
40
+ overview_md, overview_df, insights, chart, anomalies_md, anomalies_df, ml_recs, text_analysis_update, download_path = results
41
 
42
  print("Pipeline finished successfully (Mocked LLM).")
43
  print("✅ Visualization: Charts generated.")
 
48
 
49
  # 6. Text Analysis
50
  print("Testing Text Analysis (Mock)...")
51
+ if text_analysis_update:
52
+ print(f"✅ Text Analysis Result: {str(text_analysis_update)[:50]}...")
53
  else:
54
  print("ℹ️ No Text Analysis generated (Expected for numeric example).")
55
 
56
  print("\n🎉 Pipeline verification passed!")
57
  print(f"Overview MD Length: {len(overview_md)}")
58
  print(f"Overview DF Shape: {overview_df.shape if hasattr(overview_df, 'shape') else 'None'}")
59
+ print(f"ML Recs Length: {len(ml_recs)}")
60
  print(f"Chart Object: {type(chart)}")
61
  print(f"Anomalies MD Length: {len(anomalies_md)}")
62
  print(f"Anomalies DF Shape: {anomalies_df.shape if hasattr(anomalies_df, 'shape') else 'None'}")
63
 
64
+ # print(f"Text Analysis: {text_analysis_update}") # Might be a dict or string
65
+
66
+ print(f"Download Path: {download_path}")
67
+
68
  except Exception as e:
69
  print(f"Pipeline Failed: {e}")
70
  import traceback