salihfurkaan commited on
Commit
e68d049
·
1 Parent(s): 90ab796

Add HF Datasets support and dynamic UI improvements

Browse files
__pycache__/app.cpython-313.pyc CHANGED
Binary files a/__pycache__/app.cpython-313.pyc and b/__pycache__/app.cpython-313.pyc differ
 
app.py CHANGED
@@ -2,7 +2,7 @@
2
  import gradio as gr
3
  import pandas as pd
4
  import os
5
- from src.ingestion import load_file
6
  from src.profiling import profile_data, get_overview_text
7
  from src.cleaning import clean_data
8
  from src.anomalies import detect_anomalies
@@ -10,42 +10,45 @@ from src.visualization import generate_charts
10
  from src.llm import get_insights, get_followup_questions, ask_llm, get_ml_recommendations, analyze_text_content
11
 
12
  # Updated analyze_dataset to accept api_token
13
- def analyze_dataset(file_obj, api_token):
14
- if file_obj is None:
 
 
 
 
 
15
  return (
16
- "## Please upload a file to begin.",
17
  pd.DataFrame(),
18
- "",
19
- None,
20
- "",
21
  pd.DataFrame(),
22
  "",
23
- "", # Text Analysis
24
  None # For download file
25
  )
26
-
27
- # 1. Ingestion
28
- df, error, load_log = load_file(file_obj)
29
  if error:
30
- return f"## Error: {error}", pd.DataFrame(), "", None, "", pd.DataFrame(), "", "", None
31
-
32
  # 2. Profiling & Cleaning
33
  df_clean, cleaning_log = clean_data(df)
34
  profile = profile_data(df_clean)
35
  overview_text = get_overview_text(profile)
36
-
37
  # 3. Anomalies
38
  anomalies_df, anomaly_summary = detect_anomalies(df_clean)
39
-
40
  # 4. Visualization
41
  chart_figure = generate_charts(df_clean, profile)
42
-
43
  # 5. LLM Insights & Questions
44
  insights = get_insights(overview_text, anomaly_summary, api_token)
45
  ml_recommendations = get_ml_recommendations(overview_text, api_token)
46
-
47
  # 6. Text Analysis (New)
48
- text_analysis = ""
49
  # Check for 'Content' column from .txt ingestion or 'Review'/'Text' columns in CSV
50
  text_cols = [col for col in df_clean.columns if col.lower() in ['content', 'text', 'review', 'comments']]
51
  if text_cols:
@@ -54,18 +57,19 @@ def analyze_dataset(file_obj, api_token):
54
  # Get up to 50 lines/samples
55
  samples = df_clean[target_col].dropna().astype(str).tolist()
56
  if samples:
57
- text_analysis = analyze_text_content(samples, api_token)
58
-
 
59
  # Format Outputs
60
  overview_output = f"{overview_text}\n\n"
61
  if load_log:
62
  overview_output += f"{load_log}\n\n"
63
  overview_output += "**Data Cleaning Log:**\n" + "\n".join([f"- {item}" for item in cleaning_log])
64
-
65
  # Save cleaned data for download
66
  output_path = "cleaned_data.csv"
67
  df_clean.to_csv(output_path, index=False)
68
-
69
  return (
70
  overview_output, # Dataset Overview (Markdown)
71
  df_clean.head(), # Dataset Overview (DataFrame)
@@ -74,7 +78,7 @@ def analyze_dataset(file_obj, api_token):
74
  f"### Anomaly Detection Report\n{anomaly_summary}", # Anomalies Markdown
75
  anomalies_df, # Anomalies DataFrame
76
  ml_recommendations, # ML Recommendations
77
- text_analysis, # Text Analysis (New)
78
  output_path # Download File Path
79
  )
80
 
@@ -90,7 +94,7 @@ def load_example():
90
  df = pd.DataFrame(dummy_data)
91
  # Add some anomalies
92
  df.loc[6, "Salary"] = 1200000 # outlier
93
-
94
  df.to_csv("example_dataset.csv", index=False)
95
  return "example_dataset.csv"
96
 
@@ -100,8 +104,8 @@ def chat_response(message, history, overview_text, api_token):
100
  return ask_llm(message, history, overview_text, api_token)
101
 
102
  # Updated process function wrapper to match inputs/outputs
103
- def process_file_wrapper(file_obj, api_token):
104
- results = analyze_dataset(file_obj, api_token)
105
  return results + (results[0],) # Append overview_md for the state
106
 
107
  # --- Custom Styling & Theme ---
@@ -133,10 +137,22 @@ h1 {
133
  text-align: center;
134
  color: #94a3b8;
135
  font-size: 1.2rem;
136
- margin-bottom: 2.5rem;
137
  font-weight: 300;
138
  }
139
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  /* Sidebar Styling */
141
  .sidebar-content {
142
  background: linear-gradient(145deg, #1e293b, #0f172a);
@@ -211,27 +227,42 @@ theme = gr.themes.Soft(
211
  with gr.Blocks(title="Auto Data Analyst", theme=theme, css=custom_css) as demo:
212
  gr.Markdown("# ⚡ Auto Data Analyst")
213
  gr.Markdown("<div class='subtitle'>Instant AI Analysis • Professional Insights</div>")
214
-
 
 
 
 
 
 
 
 
215
  # State to hold the overview text for the chatbot
216
  overview_state = gr.State()
217
-
218
  with gr.Row():
219
  # Sidebar
220
  with gr.Column(scale=1, elem_classes="sidebar-content"):
221
  gr.Markdown("### 📂 Data Source")
222
- file_upload = gr.File(label="Upload Dataset", file_types=[".csv", ".xlsx", ".json", ".parquet", ".txt", ".zip"])
223
- example_btn = gr.Button("🎲 Load Sample Data", variant="secondary")
224
-
 
 
 
 
 
 
 
225
  gr.Markdown("---")
226
  gr.Markdown("### 🔐 Authentication")
227
  api_token_input = gr.Textbox(
228
- label="Hugging Face Token (Optional)",
229
  placeholder="hf_...",
230
  type="password",
231
  info="Paste your token for higher rate limits."
232
  )
233
  gr.Markdown("<small style='color: #64748b;'>Get a free token in your [HF Settings](https://huggingface.co/settings/tokens).</small>")
234
-
235
  # Main Content
236
  with gr.Column(scale=4):
237
  with gr.Tabs():
@@ -240,50 +271,68 @@ with gr.Blocks(title="Auto Data Analyst", theme=theme, css=custom_css) as demo:
240
  # Removed height from Dataframe to avoid Gradio error
241
  dataframe_view = gr.Dataframe(interactive=False, label="Data Preview")
242
  download_btn = gr.DownloadButton("⬇️ Download Cleaned CSV", label="Download CSV", variant="primary")
243
-
244
  with gr.TabItem("💡 Insights"):
245
  insights_md = gr.Markdown("AI Insights will appear here...")
246
-
247
  with gr.TabItem("📈 Visuals"):
248
  charts_plot = gr.Plot(label="Interactive Dashboard")
249
-
250
  with gr.TabItem("⚠️ Anomalies"):
251
  anomalies_md = gr.Markdown("Anomaly report...")
252
  anomalies_df_view = gr.Dataframe(interactive=False, label="Detected Anomalies")
253
-
254
  with gr.TabItem("🧠 ML Models"):
255
  ml_md = gr.Markdown("ML Recommendations will appear here.")
256
-
257
- with gr.TabItem("📝 Text Analysis"):
258
  text_analysis_md = gr.Markdown("Upload a .txt file or dataset with a 'Content/Review' column to see text analysis.")
259
-
260
  with gr.TabItem("💬 Assistant"):
261
  chatbot = gr.ChatInterface(
262
  fn=chat_response,
263
  additional_inputs=[overview_state, api_token_input]
264
  )
265
-
266
- # Event wiring
267
  file_upload.change(
268
- fn=process_file_wrapper,
269
- inputs=[file_upload, api_token_input],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
  outputs=[
271
- overview_md,
272
- dataframe_view,
273
- insights_md,
274
- charts_plot,
275
- anomalies_md,
276
- anomalies_df_view,
277
  ml_md,
278
- text_analysis_md,
279
  download_btn,
280
  overview_state
281
  ]
282
  )
283
-
284
  example_btn.click(
285
  fn=load_example,
286
- outputs=[file_upload]
287
  )
288
 
289
  if __name__ == "__main__":
 
2
  import gradio as gr
3
  import pandas as pd
4
  import os
5
+ from src.ingestion import load_file, load_hf_dataset
6
  from src.profiling import profile_data, get_overview_text
7
  from src.cleaning import clean_data
8
  from src.anomalies import detect_anomalies
 
10
  from src.llm import get_insights, get_followup_questions, ask_llm, get_ml_recommendations, analyze_text_content
11
 
12
  # Updated analyze_dataset to accept api_token
13
+ def analyze_dataset(file_obj, hf_dataset_name, api_token):
14
+ # Handle HF Dataset loading if name is provided
15
+ if hf_dataset_name:
16
+ df, error, load_log = load_hf_dataset(hf_dataset_name, api_token=api_token)
17
+ elif file_obj:
18
+ df, error, load_log = load_file(file_obj)
19
+ else:
20
  return (
21
+ "## Please upload a file or enter a HF Dataset name to begin.",
22
  pd.DataFrame(),
23
+ "",
24
+ None,
25
+ "",
26
  pd.DataFrame(),
27
  "",
28
+ gr.update(visible=False, value=""), # Text Analysis hidden
29
  None # For download file
30
  )
31
+
 
 
32
  if error:
33
+ return f"## Error: {error}", pd.DataFrame(), "", None, "", pd.DataFrame(), "", gr.update(visible=False, value=""), None
34
+
35
  # 2. Profiling & Cleaning
36
  df_clean, cleaning_log = clean_data(df)
37
  profile = profile_data(df_clean)
38
  overview_text = get_overview_text(profile)
39
+
40
  # 3. Anomalies
41
  anomalies_df, anomaly_summary = detect_anomalies(df_clean)
42
+
43
  # 4. Visualization
44
  chart_figure = generate_charts(df_clean, profile)
45
+
46
  # 5. LLM Insights & Questions
47
  insights = get_insights(overview_text, anomaly_summary, api_token)
48
  ml_recommendations = get_ml_recommendations(overview_text, api_token)
49
+
50
  # 6. Text Analysis (New)
51
+ text_analysis_output = gr.update(visible=False, value="")
52
  # Check for 'Content' column from .txt ingestion or 'Review'/'Text' columns in CSV
53
  text_cols = [col for col in df_clean.columns if col.lower() in ['content', 'text', 'review', 'comments']]
54
  if text_cols:
 
57
  # Get up to 50 lines/samples
58
  samples = df_clean[target_col].dropna().astype(str).tolist()
59
  if samples:
60
+ analysis_result = analyze_text_content(samples, api_token)
61
+ text_analysis_output = gr.update(visible=True, value=analysis_result)
62
+
63
  # Format Outputs
64
  overview_output = f"{overview_text}\n\n"
65
  if load_log:
66
  overview_output += f"{load_log}\n\n"
67
  overview_output += "**Data Cleaning Log:**\n" + "\n".join([f"- {item}" for item in cleaning_log])
68
+
69
  # Save cleaned data for download
70
  output_path = "cleaned_data.csv"
71
  df_clean.to_csv(output_path, index=False)
72
+
73
  return (
74
  overview_output, # Dataset Overview (Markdown)
75
  df_clean.head(), # Dataset Overview (DataFrame)
 
78
  f"### Anomaly Detection Report\n{anomaly_summary}", # Anomalies Markdown
79
  anomalies_df, # Anomalies DataFrame
80
  ml_recommendations, # ML Recommendations
81
+ text_analysis_output, # Text Analysis (Dynamic)
82
  output_path # Download File Path
83
  )
84
 
 
94
  df = pd.DataFrame(dummy_data)
95
  # Add some anomalies
96
  df.loc[6, "Salary"] = 1200000 # outlier
97
+
98
  df.to_csv("example_dataset.csv", index=False)
99
  return "example_dataset.csv"
100
 
 
104
  return ask_llm(message, history, overview_text, api_token)
105
 
106
  # Updated process function wrapper to match inputs/outputs
107
def process_data_wrapper(file_obj, hf_dataset, api_token):
    """Run the full analysis and mirror the overview text into the chat state.

    Returns the analyze_dataset output tuple with its first element (the
    overview markdown) appended once more, so the extra `overview_state`
    output component receives the same text the Overview tab shows.
    """
    outputs = analyze_dataset(file_obj, hf_dataset, api_token)
    overview_for_state = outputs[0]
    return (*outputs, overview_for_state)
110
 
111
  # --- Custom Styling & Theme ---
 
137
  text-align: center;
138
  color: #94a3b8;
139
  font-size: 1.2rem;
140
+ margin-bottom: 1.5rem;
141
  font-weight: 300;
142
  }
143
 
144
+ .feature-highlights {
145
+ text-align: center;
146
+ color: #cbd5e1;
147
+ font-size: 0.95rem;
148
+ margin-bottom: 2rem;
149
+ background: rgba(30, 41, 59, 0.5);
150
+ padding: 10px;
151
+ border-radius: 8px;
152
+ border: 1px solid #334155;
153
+ display: inline-block;
154
+ }
155
+
156
  /* Sidebar Styling */
157
  .sidebar-content {
158
  background: linear-gradient(145deg, #1e293b, #0f172a);
 
227
  with gr.Blocks(title="Auto Data Analyst", theme=theme, css=custom_css) as demo:
228
  gr.Markdown("# ⚡ Auto Data Analyst")
229
  gr.Markdown("<div class='subtitle'>Instant AI Analysis • Professional Insights</div>")
230
+
231
+ # Feature Highlights / Advertisement
232
+ with gr.Row(elem_classes="group"):
233
+ gr.Markdown(
234
+ "<div class='feature-highlights' style='width: 100%;'>"
235
+ "✨ <b>Supports:</b> CSV, Excel, JSON, Parquet, Zip (Smart Selection) & Hugging Face Datasets! 🚀<br>"
236
+ "</div>"
237
+ )
238
+
239
  # State to hold the overview text for the chatbot
240
  overview_state = gr.State()
241
+
242
  with gr.Row():
243
  # Sidebar
244
  with gr.Column(scale=1, elem_classes="sidebar-content"):
245
  gr.Markdown("### 📂 Data Source")
246
+
247
+ with gr.Tabs():
248
+ with gr.TabItem("Upload"):
249
+ file_upload = gr.File(label="Upload File", file_types=[".csv", ".xlsx", ".json", ".parquet", ".txt", ".zip"])
250
+ example_btn = gr.Button("🎲 Load Sample Data", variant="secondary")
251
+
252
+ with gr.TabItem("HF Dataset"):
253
+ hf_input = gr.Textbox(label="Dataset Name", placeholder="e.g. titanic, dair-ai/emotion", info="Loads the 'train' split.")
254
+ hf_load_btn = gr.Button("⬇️ Load Dataset", variant="primary")
255
+
256
  gr.Markdown("---")
257
  gr.Markdown("### 🔐 Authentication")
258
  api_token_input = gr.Textbox(
259
+ label="Hugging Face Token (Optional)",
260
  placeholder="hf_...",
261
  type="password",
262
  info="Paste your token for higher rate limits."
263
  )
264
  gr.Markdown("<small style='color: #64748b;'>Get a free token in your [HF Settings](https://huggingface.co/settings/tokens).</small>")
265
+
266
  # Main Content
267
  with gr.Column(scale=4):
268
  with gr.Tabs():
 
271
  # Removed height from Dataframe to avoid Gradio error
272
  dataframe_view = gr.Dataframe(interactive=False, label="Data Preview")
273
  download_btn = gr.DownloadButton("⬇️ Download Cleaned CSV", label="Download CSV", variant="primary")
274
+
275
  with gr.TabItem("💡 Insights"):
276
  insights_md = gr.Markdown("AI Insights will appear here...")
277
+
278
  with gr.TabItem("📈 Visuals"):
279
  charts_plot = gr.Plot(label="Interactive Dashboard")
280
+
281
  with gr.TabItem("⚠️ Anomalies"):
282
  anomalies_md = gr.Markdown("Anomaly report...")
283
  anomalies_df_view = gr.Dataframe(interactive=False, label="Detected Anomalies")
284
+
285
  with gr.TabItem("🧠 ML Models"):
286
  ml_md = gr.Markdown("ML Recommendations will appear here.")
287
+
288
+ with gr.TabItem("📝 Text Analysis", visible=False) as text_tab:
289
  text_analysis_md = gr.Markdown("Upload a .txt file or dataset with a 'Content/Review' column to see text analysis.")
290
+
291
  with gr.TabItem("💬 Assistant"):
292
  chatbot = gr.ChatInterface(
293
  fn=chat_response,
294
  additional_inputs=[overview_state, api_token_input]
295
  )
296
+
297
+ # Event wiring - File Upload
298
  file_upload.change(
299
+ fn=process_data_wrapper,
300
+ inputs=[file_upload, gr.State(None), api_token_input],
301
+ outputs=[
302
+ overview_md,
303
+ dataframe_view,
304
+ insights_md,
305
+ charts_plot,
306
+ anomalies_md,
307
+ anomalies_df_view,
308
+ ml_md,
309
+ text_tab, # Target the TabItem for visibility
310
+ download_btn,
311
+ overview_state
312
+ ]
313
+ )
314
+
315
+ # Also wire HF Load Button
316
+ hf_load_btn.click(
317
+ fn=process_data_wrapper,
318
+ inputs=[gr.State(None), hf_input, api_token_input],
319
  outputs=[
320
+ overview_md,
321
+ dataframe_view,
322
+ insights_md,
323
+ charts_plot,
324
+ anomalies_md,
325
+ anomalies_df_view,
326
  ml_md,
327
+ text_tab, # Target the TabItem for visibility
328
  download_btn,
329
  overview_state
330
  ]
331
  )
332
+
333
  example_btn.click(
334
  fn=load_example,
335
+ outputs=[file_upload]
336
  )
337
 
338
  if __name__ == "__main__":
requirements.txt CHANGED
@@ -4,5 +4,6 @@ scikit-learn
4
  plotly
5
  gradio
6
  huggingface_hub
 
7
  openpyxl
8
  pyarrow
 
4
  plotly
5
  gradio
6
  huggingface_hub
7
+ datasets
8
  openpyxl
9
  pyarrow
src/__pycache__/ingestion.cpython-313.pyc CHANGED
Binary files a/src/__pycache__/ingestion.cpython-313.pyc and b/src/__pycache__/ingestion.cpython-313.pyc differ
 
src/ingestion.py CHANGED
@@ -122,3 +122,26 @@ def load_file(file_obj):
122
 
123
  except Exception as e:
124
  return None, f"Error loading file: {str(e)}", None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
  except Exception as e:
124
  return None, f"Error loading file: {str(e)}", None
125
+
126
def load_hf_dataset(dataset_name, split='train', api_token=None, trust_remote_code=True):
    """
    Load a dataset from the Hugging Face Hub and return it as a pandas DataFrame.

    Args:
        dataset_name: Hub dataset id, e.g. "titanic" or "dair-ai/emotion".
            If a config is needed, the user can specify "dataset_name/config".
        split: Which split to load (defaults to 'train').
        api_token: Optional HF token, forwarded for gated/private datasets.
        trust_remote_code: Forwarded to ``datasets.load_dataset``.
            SECURITY: True (the default, kept for backward compatibility)
            lets script-based datasets execute arbitrary repo code locally —
            pass False when the dataset name comes from untrusted input.

    Returns:
        (df, error, log) — exactly one of ``df`` / ``error`` is non-None;
        ``log`` is a human-readable load message on success, else None.
    """
    # Guard against empty / whitespace-only names before touching the Hub.
    if not dataset_name or not str(dataset_name).strip():
        return None, "No dataset name provided.", None
    dataset_name = str(dataset_name).strip()

    try:
        # Imported lazily so the app still starts when `datasets` is absent;
        # a missing package surfaces as an error string like any load failure.
        from datasets import load_dataset

        ds = load_dataset(
            dataset_name,
            split=split,
            token=api_token,
            trust_remote_code=trust_remote_code,
        )

        # Convert to pandas for the downstream profiling/cleaning pipeline.
        df = ds.to_pandas()

        if df.empty:
            return None, f"Dataset '{dataset_name}' (split='{split}') is empty.", None

        return df, None, f"Loaded Hugging Face Dataset: `{dataset_name}` (Split: {split})"

    except Exception as e:
        # Best-effort API: report any failure (missing lib, bad name, auth,
        # network) as an error string instead of raising into the UI.
        return None, f"Error loading HF Dataset '{dataset_name}': {str(e)}", None
verify_pipeline_mock.py CHANGED
@@ -23,16 +23,21 @@ with patch('src.llm.get_insights', return_value="Mocked Insights") as mock_insig
23
  example_path = load_example()
24
  print(f"Example dataset created at: {example_path}")
25
 
26
- print("Running pipeline with MOCKED LLM...")
27
  mock_file = MockFile(example_path)
28
-
 
29
  try:
30
- results = analyze_dataset(mock_file, api_token="test")
 
31
 
32
  # Unpack results to verify types (updated for new return signature)
33
- # (overview_output, df_head, insights, chart, anomaly_md, anomalies_df, ml_recs, text_analysis, download_path)
34
- # NOTE: load_file now returns 3 values, but analyze_dataset still returns 9. The log is inside overview_output.
35
- overview_md, overview_df, insights, chart, anomalies_md, anomalies_df, ml_recs, text_analysis, download_path = results
 
 
 
 
36
 
37
  print("Pipeline finished successfully (Mocked LLM).")
38
  print("✅ Visualization: Charts generated.")
@@ -43,18 +48,23 @@ with patch('src.llm.get_insights', return_value="Mocked Insights") as mock_insig
43
 
44
  # 6. Text Analysis
45
  print("Testing Text Analysis (Mock)...")
46
- if text_analysis:
47
- print(f"✅ Text Analysis Result: {text_analysis[:50]}...")
48
  else:
49
  print("ℹ️ No Text Analysis generated (Expected for numeric example).")
50
 
51
  print("\n🎉 Pipeline verification passed!")
52
  print(f"Overview MD Length: {len(overview_md)}")
53
  print(f"Overview DF Shape: {overview_df.shape if hasattr(overview_df, 'shape') else 'None'}")
 
54
  print(f"Chart Object: {type(chart)}")
55
  print(f"Anomalies MD Length: {len(anomalies_md)}")
56
  print(f"Anomalies DF Shape: {anomalies_df.shape if hasattr(anomalies_df, 'shape') else 'None'}")
57
 
 
 
 
 
58
  except Exception as e:
59
  print(f"Pipeline Failed: {e}")
60
  import traceback
 
23
  example_path = load_example()
24
  print(f"Example dataset created at: {example_path}")
25
 
 
26
  mock_file = MockFile(example_path)
27
+ print(f"Running pipeline with MOCKED LLM...{os.path.basename(mock_file.name)}")
28
+
29
  try:
30
+ # analyze_dataset signature changed: (file_obj, hf_dataset_name, api_token)
31
+ results = analyze_dataset(mock_file, None, api_token="test")
32
 
33
  # Unpack results to verify types (updated for new return signature)
34
+ # (overview_output, df_head, insights, chart, anomaly_md, anomalies_df, ml_recs, text_analysis_output, download_path)
35
+ # NOTE: text_analysis_output is not a plain string here — analyze_dataset
36
+ # returns a gr.update(visible=..., value=...) payload for that output slot,
37
+ # so treat it as an update dict when inspecting it below.
39
+
40
+ overview_md, overview_df, insights, chart, anomalies_md, anomalies_df, ml_recs, text_analysis_update, download_path = results
41
 
42
  print("Pipeline finished successfully (Mocked LLM).")
43
  print("✅ Visualization: Charts generated.")
 
48
 
49
  # 6. Text Analysis
50
  print("Testing Text Analysis (Mock)...")
51
+ if text_analysis_update:
52
+ print(f"✅ Text Analysis Result: {str(text_analysis_update)[:50]}...")
53
  else:
54
  print("ℹ️ No Text Analysis generated (Expected for numeric example).")
55
 
56
  print("\n🎉 Pipeline verification passed!")
57
  print(f"Overview MD Length: {len(overview_md)}")
58
  print(f"Overview DF Shape: {overview_df.shape if hasattr(overview_df, 'shape') else 'None'}")
59
+ print(f"ML Recs Length: {len(ml_recs)}")
60
  print(f"Chart Object: {type(chart)}")
61
  print(f"Anomalies MD Length: {len(anomalies_md)}")
62
  print(f"Anomalies DF Shape: {anomalies_df.shape if hasattr(anomalies_df, 'shape') else 'None'}")
63
 
64
+ # print(f"Text Analysis: {text_analysis_update}") # Might be a dict or string
65
+
66
+ print(f"Download Path: {download_path}")
67
+
68
  except Exception as e:
69
  print(f"Pipeline Failed: {e}")
70
  import traceback