Spaces:
Running
Running
Commit ·
bb9980b
1
Parent(s): bd3247d
demo files
Browse files- app.py +132 -0
- requirements.txt +8 -0
- src/__init__.py +2 -0
- src/__pycache__/__init__.cpython-313.pyc +0 -0
- src/__pycache__/anomalies.cpython-313.pyc +0 -0
- src/__pycache__/cleaning.cpython-313.pyc +0 -0
- src/__pycache__/ingestion.cpython-313.pyc +0 -0
- src/__pycache__/llm.cpython-313.pyc +0 -0
- src/__pycache__/profiling.cpython-313.pyc +0 -0
- src/__pycache__/visualization.cpython-313.pyc +0 -0
- src/anomalies.py +32 -0
- src/cleaning.py +68 -0
- src/ingestion.py +50 -0
- src/llm.py +57 -0
- src/profiling.py +82 -0
- src/visualization.py +56 -0
- verify_pipeline.py +44 -0
- verify_pipeline_mock.py +49 -0
app.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import gradio as gr
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import os
|
| 5 |
+
from src.ingestion import load_file
|
| 6 |
+
from src.profiling import profile_data, get_overview_text
|
| 7 |
+
from src.cleaning import clean_data
|
| 8 |
+
from src.anomalies import detect_anomalies
|
| 9 |
+
from src.visualization import generate_charts
|
| 10 |
+
from src.llm import get_insights, get_followup_questions
|
| 11 |
+
|
| 12 |
+
# Global state to hold the dataframe for chat (if needed in future)
|
| 13 |
+
# For this stateless demo, we process per request.
|
| 14 |
+
|
| 15 |
+
def analyze_dataset(file_obj):
    """
    Run the full pipeline (ingest -> clean -> profile -> anomalies -> charts
    -> LLM insights) on an uploaded file.

    Returns a 7-tuple aligned with the UI output components:
    (overview_md, preview_df, insights_md, chart_figure, anomalies_md,
     anomalies_df, questions_md)
    """
    # Every return must supply one value per wired output component (7 total);
    # the previous early returns only produced 6, misaligning the UI outputs.
    empty = pd.DataFrame()
    if file_obj is None:
        return ("## Please upload a file to begin.", empty, "", None, "", empty, "")

    # 1. Ingestion
    df, error = load_file(file_obj)
    if error:
        return (f"## Error: {error}", empty, "", None, "", empty, "")

    # 2. Cleaning & Profiling (clean first so profiling sees normalized columns)
    df_clean, cleaning_log = clean_data(df)
    profile = profile_data(df_clean)
    overview_text = get_overview_text(profile)

    # 3. Anomaly detection
    anomalies_df, anomaly_summary = detect_anomalies(df_clean)

    # 4. Visualization
    chart_figure = generate_charts(df_clean, profile)

    # 5. LLM insights & follow-up questions (text summaries only, no raw data)
    insights = get_insights(overview_text, anomaly_summary)
    questions = get_followup_questions(overview_text)

    # Format outputs
    overview_output = f"{overview_text}\n\n**Data Cleaning Log:**\n" + "\n".join(f"- {item}" for item in cleaning_log)

    return (
        overview_output,                                      # Dataset Overview (Markdown)
        df_clean.head(),                                      # Data preview (DataFrame)
        insights,                                             # Key Insights
        chart_figure,                                         # Visual Story
        f"### Anomaly Detection Report\n{anomaly_summary}",   # Anomalies (Markdown)
        anomalies_df,                                         # Anomalies (DataFrame)
        questions,                                            # Next Steps
    )
|
| 62 |
+
|
| 63 |
+
def load_example():
    """Write a small synthetic HR dataset to disk and return its path."""
    frame = pd.DataFrame({
        "Name": ["Alice", "Bob", "Charlie", "David", "Eve", "Frank", "Grace", "Heidi", "Ivan", "Judy"],
        "Age": [25, 30, 35, 40, 22, 28, 45, 32, 29, 27],
        "Salary": [50000, 60000, 75000, 90000, 48000, 52000, 120000, 65000, 58000, 54000],
        "Department": ["HR", "Engineering", "Engineering", "Management", "HR", "Marketing", "Management", "Engineering", "Marketing", "HR"],
        "Performance_Score": [3.5, 4.2, 4.8, 3.9, 3.1, 4.0, 4.5, 4.3, 3.8, 4.1],
    })
    # Inject a deliberate salary outlier so anomaly detection has something to find.
    frame.loc[6, "Salary"] = 1200000

    out_path = "example_dataset.csv"
    frame.to_csv(out_path, index=False)
    return out_path
|
| 78 |
+
|
| 79 |
+
# Updated process function wrapper to match inputs/outputs
|
| 80 |
+
def process_file_wrapper(file_obj):
    """Thin UI adapter: forward the upload straight to analyze_dataset.

    Output order matches the wired components: overview_md, preview_df,
    insights_md, chart_figure, anomalies_md, anomalies_df, questions_md.
    """
    return analyze_dataset(file_obj)
|
| 85 |
+
|
| 86 |
+
with gr.Blocks(title="Auto Data Analyst", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📊 Auto Data Analyst — No Questions Needed")
    gr.Markdown("Upload your structured data (CSV, Excel, JSON, Parquet) and get instant professional insights.")

    with gr.Row():
        with gr.Column(scale=1):
            file_upload = gr.File(label="Upload Dataset", file_types=[".csv", ".xlsx", ".json", ".parquet"])
            example_btn = gr.Button("Try Example Dataset", variant="secondary")

        with gr.Column(scale=3):
            with gr.Tabs():
                with gr.TabItem("Dataset Overview"):
                    overview_md = gr.Markdown("Please upload a file to see the overview.")
                    dataframe_view = gr.Dataframe(interactive=False, label="Data Preview")

                with gr.TabItem("Key Insights"):
                    insights_md = gr.Markdown("Insights will appear here.")

                with gr.TabItem("Visual Story"):
                    charts_plot = gr.Plot(label="Data Visualization")

                with gr.TabItem("Anomalies & Outliers"):
                    anomalies_md = gr.Markdown("Anomaly detection results.")
                    anomalies_df_view = gr.Dataframe(interactive=False, label="Detected Anomalies")

                with gr.TabItem("Next Steps"):
                    questions_md = gr.Markdown("Suggested follow-up questions.")

    # Event wiring: the analysis runs whenever the uploaded file changes.
    analysis_outputs = [overview_md, dataframe_view, insights_md, charts_plot,
                        anomalies_md, anomalies_df_view, questions_md]
    file_upload.change(
        fn=process_file_wrapper,
        inputs=[file_upload],
        outputs=analysis_outputs,
    )

    # Setting a File component's value programmatically does not reliably fire
    # its .change event, so chain the analysis explicitly with .then().
    example_btn.click(
        fn=load_example,
        outputs=[file_upload],
    ).then(
        fn=process_file_wrapper,
        inputs=[file_upload],
        outputs=analysis_outputs,
    )

if __name__ == "__main__":
    demo.launch()
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pandas
|
| 2 |
+
numpy
|
| 3 |
+
scikit-learn
|
| 4 |
+
plotly
|
| 5 |
+
gradio
|
| 6 |
+
huggingface_hub
|
| 7 |
+
openpyxl
|
| 8 |
+
pyarrow
|
src/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Auto Data Analyst
|
| 2 |
+
# Internal modules
|
src/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (160 Bytes). View file
|
|
|
src/__pycache__/anomalies.cpython-313.pyc
ADDED
|
Binary file (1.69 kB). View file
|
|
|
src/__pycache__/cleaning.cpython-313.pyc
ADDED
|
Binary file (3.09 kB). View file
|
|
|
src/__pycache__/ingestion.cpython-313.pyc
ADDED
|
Binary file (1.96 kB). View file
|
|
|
src/__pycache__/llm.cpython-313.pyc
ADDED
|
Binary file (2.21 kB). View file
|
|
|
src/__pycache__/profiling.cpython-313.pyc
ADDED
|
Binary file (4.43 kB). View file
|
|
|
src/__pycache__/visualization.cpython-313.pyc
ADDED
|
Binary file (2.17 kB). View file
|
|
|
src/anomalies.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from sklearn.ensemble import IsolationForest
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
def detect_anomalies(df):
    """Flag unusual rows in the numeric part of *df* using Isolation Forest.

    Returns (anomalies_df, summary_text); the anomalies dataframe keeps all
    original columns for the flagged rows.
    """
    if df is None or df.empty:
        return pd.DataFrame(), "No data for anomaly detection."

    numeric_part = df.select_dtypes(include=[np.number])
    if numeric_part.empty:
        return pd.DataFrame(), "No numerical columns found for anomaly detection."

    # sklearn cannot handle NaNs; impute with per-column medians as a safeguard
    # even though upstream cleaning should already have filled them.
    numeric_part = numeric_part.fillna(numeric_part.median())

    try:
        forest = IsolationForest(contamination=0.05, random_state=42)
        labels = forest.fit_predict(numeric_part)

        # A label of -1 marks an anomaly; the mask aligns positionally with df.
        outliers = df[labels == -1]
        return outliers, f"Detected {len(outliers)} anomalies ({len(outliers)/len(df):.1%} of data)."
    except Exception as e:
        return pd.DataFrame(), f"Anomaly detection failed: {str(e)}"
|
src/cleaning.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
+
def clean_column_names(df):
    """Normalize *df*'s column labels to snake_case, in place.

    Lowercases every label, strips punctuation, and collapses whitespace runs
    into single underscores. Returns the same dataframe for chaining.
    """
    normalized = (
        df.columns.astype(str)
        .str.lower()
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.replace(r'\s+', '_', regex=True)
    )
    df.columns = normalized
    return df
|
| 10 |
+
|
| 11 |
+
def clean_data(df):
    """
    Perform basic deterministic cleaning on a COPY of *df*:
    - Standardizes column names to snake_case
    - Drops fully-empty columns and rows
    - Removes duplicate rows
    - Fills remaining missing values (median for numeric, 'Unknown' otherwise)

    Returns: (cleaned_df, cleaning_log). The input dataframe is NOT mutated
    (the original implementation renamed columns and filled values in place,
    contradicting its own comments about returning a copy).
    """
    if df is None or df.empty:
        return df, []

    log = []

    # Work on a copy so the caller's dataframe is left untouched.
    df = df.copy()

    # 1. Standardize column names (lowercase, strip punctuation, snake_case).
    old_cols = list(df.columns)
    df.columns = (
        df.columns.astype(str)
        .str.lower()
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.replace(r'\s+', '_', regex=True)
    )
    if old_cols != list(df.columns):
        log.append("Standardized column names to snake_case.")

    # 2. Drop columns/rows that are entirely empty.
    initial_shape = df.shape
    df = df.dropna(how='all', axis=1)
    df = df.dropna(how='all', axis=0)
    dropped_cols = initial_shape[1] - df.shape[1]
    dropped_rows = initial_shape[0] - df.shape[0]
    if dropped_cols > 0:
        log.append(f"Dropped {dropped_cols} empty columns.")
    if dropped_rows > 0:
        log.append(f"Dropped {dropped_rows} empty rows.")

    # 3. Remove duplicate rows.
    duplicates = df.duplicated().sum()
    if duplicates > 0:
        df = df.drop_duplicates()
        log.append(f"Removed {duplicates} duplicate rows.")

    # 4. Fill remaining missing values: median for numeric columns, the
    #    sentinel 'Unknown' for everything else. (All-NaN columns were dropped
    #    above, so a numeric median here is always defined.)
    for col in df.columns:
        if df[col].isnull().sum() > 0:
            if pd.api.types.is_numeric_dtype(df[col]):
                fill_val = df[col].median()
                df[col] = df[col].fillna(fill_val)
                log.append(f"Filled missing values in '{col}' with median ({fill_val:.2f}).")
            else:
                df[col] = df[col].fillna("Unknown")
                log.append(f"Filled missing values in '{col}' with 'Unknown'.")

    return df, log
|
src/ingestion.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
def load_file(file_obj):
    """
    Load an uploaded file into a pandas DataFrame.

    Supports CSV, Excel, JSON, and Parquet. Accepts either a plain path
    string or a Gradio file object exposing a ``.name`` attribute — newer
    Gradio versions pass the temp-file path directly as a str, on which the
    previous ``file_obj.name`` access raised AttributeError.

    Returns: (df, None) on success or (None, error_message) on failure.
    """
    if file_obj is None:
        return None, "No file uploaded."

    try:
        # Gradio may hand us either a str path or an object with .name.
        file_path = file_obj if isinstance(file_obj, str) else file_obj.name
        file_ext = os.path.splitext(file_path)[1].lower()

        if file_ext == '.csv':
            df = pd.read_csv(file_path)
        elif file_ext in ['.xlsx', '.xls']:
            df = pd.read_excel(file_path)
        elif file_ext == '.json':
            # Try the common orientations in order of likelihood.
            try:
                df = pd.read_json(file_path, orient='records')
            except ValueError:
                try:
                    df = pd.read_json(file_path, orient='table')
                except ValueError:
                    # Fallback for other json structures if simple enough.
                    df = pd.read_json(file_path)
        elif file_ext == '.parquet':
            df = pd.read_parquet(file_path)
        else:
            return None, f"Unsupported file format: {file_ext}. Please upload CSV, Excel, JSON, or Parquet."

        # Validate tabular structure.
        if df.empty:
            return None, "The uploaded file is empty."

        # NOTE: a single-column file is technically tabular; we allow it.
        return df, None

    except Exception as e:
        return None, f"Error loading file: {str(e)}"
|
src/llm.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from huggingface_hub import InferenceClient
import os

# Initialize the inference client once at import time.
# Relies on HF_TOKEN being set in the environment (as on Spaces); running
# locally without a token may fail or be rate limited.
# timeout=30 prevents requests from hanging indefinitely.
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta", timeout=30)
|
| 9 |
+
|
| 10 |
+
def generate_text(prompt, max_new_tokens=512):
    """Send *prompt* to the hosted chat model and return its reply text.

    Falls back to a static message on any failure so the app stays usable
    when the inference API is down, slow, or rate limited.
    """
    try:
        system_msg = {"role": "system", "content": "You are a senior data analyst. You provide professional, concise, and accurate insights based on data summaries. You do NOT hallucinate numbers."}
        user_msg = {"role": "user", "content": prompt}
        reply = client.chat_completion([system_msg, user_msg], max_tokens=max_new_tokens)
        return reply.choices[0].message.content
    except Exception as e:
        # Graceful degradation instead of crashing the whole pipeline.
        return f"Based on the analysis, please review the charts and data profile. (AI Insights unavailable: {str(e)})"
|
| 21 |
+
|
| 22 |
+
def get_insights(overview_text, anomalies_text):
    """Ask the LLM for 3-5 bullet-point insights grounded in the given summaries.

    Only text summaries are sent — never raw data — and the prompt explicitly
    forbids inventing values not present in them.
    """
    prompt = f"""
Analyze the following dataset summary and anomaly report.
Generate 3-5 key professional insights.
Focus on data quality, distribution patterns, and potential issues.
Do not make up specific values not present in the summary.

Data Summary:
{overview_text}

Anomaly Report:
{anomalies_text}

Output Format:
- Insight 1
- Insight 2
- Insight 3
...
"""
    return generate_text(prompt)
|
| 42 |
+
|
| 43 |
+
def get_followup_questions(overview_text):
    """Ask the LLM to propose 3-5 follow-up questions about the dataset.

    Takes the Markdown overview text and returns a numbered list of questions.
    """
    # Fixed the ungrammatical "to deeper understand" in the original prompt,
    # which degraded the instruction quality.
    prompt = f"""
Based on the following dataset summary, suggest 3-5 relevant follow-up questions
that a data analyst should ask to better understand the business context or data quality.

Data Summary:
{overview_text}

Output Format:
1. Question 1
2. Question 2
3. Question 3
...
"""
    return generate_text(prompt)
|
src/profiling.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
+
def profile_data(df):
    """
    Build a statistical profile of *df*.

    Returns a dict with table-level metrics (row/column counts, missing and
    duplicate statistics), per-column stats under "columns_processing", and
    column names grouped by inferred kind (numerical/categorical/datetime).
    Returns {} for None or empty input.
    """
    if df is None or df.empty:
        return {}

    # Compute table-level aggregates once instead of twice as before.
    total_missing = df.isnull().sum().sum()
    dup_rows = df.duplicated().sum()

    profile = {
        "rows": len(df),
        "columns": len(df.columns),
        "column_names": list(df.columns),
        "missing_cells": total_missing,
        "missing_cells_percent": (total_missing / df.size) * 100,
        "duplicate_rows": dup_rows,
        "duplicate_rows_percent": (dup_rows / len(df)) * 100,
        "columns_processing": {},
        "numerical_columns": [],
        "categorical_columns": [],
        "datetime_columns": [],
    }

    for col in df.columns:
        series = df[col]
        missing = series.isnull().sum()

        col_profile = {
            "type": str(series.dtype),
            "unique": series.nunique(),
            "missing": missing,
            "missing_percent": (missing / len(df)) * 100,
        }

        # Classify the column and compute kind-specific stats.
        if pd.api.types.is_numeric_dtype(series):
            profile["numerical_columns"].append(col)
            col_profile["mean"] = series.mean()
            col_profile["median"] = series.median()
            col_profile["std"] = series.std()
            col_profile["min"] = series.min()
            col_profile["max"] = series.max()
            col_profile["zeros"] = (series == 0).sum()
        elif pd.api.types.is_datetime64_any_dtype(series):
            profile["datetime_columns"].append(col)
            col_profile["min_date"] = series.min()
            col_profile["max_date"] = series.max()
        else:
            profile["categorical_columns"].append(col)
            try:
                col_profile["top_categories"] = series.value_counts().head(5).to_dict()
            except Exception:  # was a bare except; don't swallow KeyboardInterrupt/SystemExit
                col_profile["top_categories"] = {}

        profile["columns_processing"][col] = col_profile

    return profile
|
| 62 |
+
|
| 63 |
+
def get_overview_text(profile):
    """
    Generates a natural language overview from the profile.

    Expects the dict produced by profile_data(); returns a Markdown string.
    Only the first 3 column names per kind are listed, with an ellipsis when
    more exist.
    """
    if not profile:
        return "No data available."

    overview = f"""
### Dataset Overview
- **Rows:** {profile['rows']:,}
- **Columns:** {profile['columns']}
- **Missing Values:** {profile['missing_cells']:,} ({profile['missing_cells_percent']:.2f}%)
- **Duplicates:** {profile['duplicate_rows']:,} ({profile['duplicate_rows_percent']:.2f}%)

#### Column Types
- **Numerical:** {len(profile['numerical_columns'])} ({', '.join(profile['numerical_columns'][:3])}{'...' if len(profile['numerical_columns']) > 3 else ''})
- **Categorical:** {len(profile['categorical_columns'])} ({', '.join(profile['categorical_columns'][:3])}{'...' if len(profile['categorical_columns']) > 3 else ''})
- **Datetime:** {len(profile['datetime_columns'])}
"""
    return overview
|
src/visualization.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import plotly.express as px
|
| 2 |
+
import plotly.graph_objects as go
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import numpy as np
|
| 5 |
+
|
| 6 |
+
def generate_charts(df, profile):
    """
    Build Plotly figures from the profiled dataframe and return one "hero" figure.

    Candidate charts, in priority order: correlation heatmap, numeric
    distributions, categorical counts, scatter of the first two numeric
    columns. Gradio's gr.Plot renders a single figure, so only the first
    candidate is returned; None when nothing can be plotted.
    """
    if df is None or df.empty:
        return None

    figures = []

    # .get() keeps this robust if the profile dict is malformed or missing keys
    # (the original indexed directly and would raise KeyError).
    num_cols = profile.get('numerical_columns', [])
    cat_cols = profile.get('categorical_columns', [])

    # 1. Correlation heatmap (needs at least two numeric columns).
    if len(num_cols) > 1:
        corr = df[num_cols].corr()
        figures.append(px.imshow(corr, text_auto=True, aspect="auto", title="Correlation Matrix"))

    # 2. Distributions for the first few numeric columns.
    for col in num_cols[:3]:
        figures.append(px.histogram(df, x=col, title=f"Distribution of {col}", marginal="box"))

    # 3. Counts for low-cardinality categorical columns.
    for col in cat_cols[:3]:
        if df[col].nunique() < 50:  # skip high-cardinality columns
            counts = df[col].value_counts().head(10)
            figures.append(px.bar(x=counts.index, y=counts.values, labels={'x': col, 'y': 'Count'}, title=f"Count of {col}"))

    # 4. Scatter of the first two numeric columns.
    if len(num_cols) >= 2:
        figures.append(px.scatter(df, x=num_cols[0], y=num_cols[1], title=f"{num_cols[0]} vs {num_cols[1]}"))

    # gr.Plot expects a single figure; the correlation matrix (when present)
    # is the most informative, and it was appended first.
    return figures[0] if figures else None
|
verify_pipeline.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
"""Smoke test: run the full app pipeline end to end (including the live LLM)."""
import pandas as pd
import sys
import os

# Add the project root to sys.path so `app` can be imported regardless of
# the directory this script is launched from. Must happen before the import.
sys.path.append(os.getcwd())

from app import analyze_dataset, load_example

class MockFile:
    # Minimal stand-in for Gradio's upload object: only .name (a path) is read.
    def __init__(self, path):
        self.name = path

print("Generating example dataset...")
example_path = load_example()
print(f"Example dataset created at: {example_path}")

print("Running pipeline...")
mock_file = MockFile(example_path)

try:
    results = analyze_dataset(mock_file)

    # Unpack results to verify types and basic shape of each output slot.
    overview_md, overview_df, insights, chart, anomalies_md, anomalies_df, questions = results

    print("Pipeline finished successfully.")
    print(f"Overview MD Length: {len(overview_md)}")
    print(f"Overview DF Shape: {overview_df.shape if hasattr(overview_df, 'shape') else 'None'}")
    print(f"Insights: {insights[:50]}...")
    print(f"Chart Object: {type(chart)}")
    print(f"Anomalies MD Length: {len(anomalies_md)}")
    print(f"Anomalies DF Shape: {anomalies_df.shape if hasattr(anomalies_df, 'shape') else 'None'}")
    print(f"Questions: {questions[:50]}...")

except Exception as e:
    # Report the failure with a full traceback but still run the cleanup below.
    print(f"Pipeline Failed: {e}")
    import traceback
    traceback.print_exc()

# Cleanup the generated demo file.
if os.path.exists(example_path):
    os.remove(example_path)
|
verify_pipeline_mock.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
"""Smoke test: run the app pipeline with the LLM calls mocked out (offline-safe)."""
import pandas as pd
import sys
import os
from unittest.mock import patch

# Add the project root to sys.path so `app` can be imported regardless of
# the directory this script is launched from. Must happen before the import.
sys.path.append(os.getcwd())

# Mock the LLM module BEFORE importing app: app does `from src.llm import ...`,
# so src.llm's attributes must already be patched when that import runs.
with patch('src.llm.get_insights', return_value="Mocked Insights") as mock_insights, \
     patch('src.llm.get_followup_questions', return_value="Mocked Questions") as mock_questions:

    from app import analyze_dataset, load_example

    class MockFile:
        # Minimal stand-in for Gradio's upload object: only .name (a path) is read.
        def __init__(self, path):
            self.name = path

    print("Generating example dataset...")
    example_path = load_example()
    print(f"Example dataset created at: {example_path}")

    print("Running pipeline with MOCKED LLM...")
    mock_file = MockFile(example_path)

    try:
        results = analyze_dataset(mock_file)

        # Unpack results to verify types and basic shape of each output slot.
        overview_md, overview_df, insights, chart, anomalies_md, anomalies_df, questions = results

        print("Pipeline finished successfully (Mocked LLM).")
        print(f"Overview MD Length: {len(overview_md)}")
        print(f"Overview DF Shape: {overview_df.shape if hasattr(overview_df, 'shape') else 'None'}")
        print(f"Insights: {insights[:50]}...")
        print(f"Chart Object: {type(chart)}")
        print(f"Anomalies MD Length: {len(anomalies_md)}")
        print(f"Anomalies DF Shape: {anomalies_df.shape if hasattr(anomalies_df, 'shape') else 'None'}")
        print(f"Questions: {questions[:50]}...")

    except Exception as e:
        # Report the failure with a full traceback but still run the cleanup below.
        print(f"Pipeline Failed: {e}")
        import traceback
        traceback.print_exc()

    # Cleanup the generated demo file.
    if os.path.exists(example_path):
        os.remove(example_path)
|