Spaces:
Running
Running
Commit ·
bb9980b
1
Parent(s): bd3247d
demo files
Browse files- app.py +132 -0
- requirements.txt +8 -0
- src/__init__.py +2 -0
- src/__pycache__/__init__.cpython-313.pyc +0 -0
- src/__pycache__/anomalies.cpython-313.pyc +0 -0
- src/__pycache__/cleaning.cpython-313.pyc +0 -0
- src/__pycache__/ingestion.cpython-313.pyc +0 -0
- src/__pycache__/llm.cpython-313.pyc +0 -0
- src/__pycache__/profiling.cpython-313.pyc +0 -0
- src/__pycache__/visualization.cpython-313.pyc +0 -0
- src/anomalies.py +32 -0
- src/cleaning.py +68 -0
- src/ingestion.py +50 -0
- src/llm.py +57 -0
- src/profiling.py +82 -0
- src/visualization.py +56 -0
- verify_pipeline.py +44 -0
- verify_pipeline_mock.py +49 -0
app.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import gradio as gr
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import os
|
| 5 |
+
from src.ingestion import load_file
|
| 6 |
+
from src.profiling import profile_data, get_overview_text
|
| 7 |
+
from src.cleaning import clean_data
|
| 8 |
+
from src.anomalies import detect_anomalies
|
| 9 |
+
from src.visualization import generate_charts
|
| 10 |
+
from src.llm import get_insights, get_followup_questions
|
| 11 |
+
|
| 12 |
+
# Global state to hold the dataframe for chat (if needed in future)
|
| 13 |
+
# For this stateless demo, we process per request.
|
| 14 |
+
|
| 15 |
+
def analyze_dataset(file_obj):
    """
    Run the full pipeline (ingest -> clean -> profile -> anomalies -> charts
    -> LLM insights) on an uploaded file.

    Returns a 7-tuple aligned with the UI output components:
    (overview_md, preview_df, insights_md, chart_figure, anomalies_md,
     anomalies_df, questions_md)
    """
    # Every return must supply one value per wired output component (7 total);
    # the previous early returns only produced 6, misaligning the UI outputs.
    empty = pd.DataFrame()
    if file_obj is None:
        return ("## Please upload a file to begin.", empty, "", None, "", empty, "")

    # 1. Ingestion
    df, error = load_file(file_obj)
    if error:
        return (f"## Error: {error}", empty, "", None, "", empty, "")

    # 2. Cleaning & Profiling (clean first so profiling sees normalized columns)
    df_clean, cleaning_log = clean_data(df)
    profile = profile_data(df_clean)
    overview_text = get_overview_text(profile)

    # 3. Anomaly detection
    anomalies_df, anomaly_summary = detect_anomalies(df_clean)

    # 4. Visualization
    chart_figure = generate_charts(df_clean, profile)

    # 5. LLM insights & follow-up questions (text summaries only, no raw data)
    insights = get_insights(overview_text, anomaly_summary)
    questions = get_followup_questions(overview_text)

    # Format outputs
    overview_output = f"{overview_text}\n\n**Data Cleaning Log:**\n" + "\n".join(f"- {item}" for item in cleaning_log)

    return (
        overview_output,                                      # Dataset Overview (Markdown)
        df_clean.head(),                                      # Data preview (DataFrame)
        insights,                                             # Key Insights
        chart_figure,                                         # Visual Story
        f"### Anomaly Detection Report\n{anomaly_summary}",   # Anomalies (Markdown)
        anomalies_df,                                         # Anomalies (DataFrame)
        questions,                                            # Next Steps
    )
|
| 62 |
+
|
| 63 |
+
def load_example():
    """Write a small synthetic HR dataset to disk and return its path."""
    frame = pd.DataFrame({
        "Name": ["Alice", "Bob", "Charlie", "David", "Eve", "Frank", "Grace", "Heidi", "Ivan", "Judy"],
        "Age": [25, 30, 35, 40, 22, 28, 45, 32, 29, 27],
        "Salary": [50000, 60000, 75000, 90000, 48000, 52000, 120000, 65000, 58000, 54000],
        "Department": ["HR", "Engineering", "Engineering", "Management", "HR", "Marketing", "Management", "Engineering", "Marketing", "HR"],
        "Performance_Score": [3.5, 4.2, 4.8, 3.9, 3.1, 4.0, 4.5, 4.3, 3.8, 4.1],
    })
    # Inject a deliberate salary outlier so anomaly detection has something to find.
    frame.loc[6, "Salary"] = 1200000

    out_path = "example_dataset.csv"
    frame.to_csv(out_path, index=False)
    return out_path
|
| 78 |
+
|
| 79 |
+
# Updated process function wrapper to match inputs/outputs
|
| 80 |
+
def process_file_wrapper(file_obj):
    """Thin UI adapter: forward the upload straight to analyze_dataset.

    Output order matches the wired components: overview_md, preview_df,
    insights_md, chart_figure, anomalies_md, anomalies_df, questions_md.
    """
    return analyze_dataset(file_obj)
|
| 85 |
+
|
| 86 |
+
with gr.Blocks(title="Auto Data Analyst", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📊 Auto Data Analyst — No Questions Needed")
    gr.Markdown("Upload your structured data (CSV, Excel, JSON, Parquet) and get instant professional insights.")

    with gr.Row():
        with gr.Column(scale=1):
            file_upload = gr.File(label="Upload Dataset", file_types=[".csv", ".xlsx", ".json", ".parquet"])
            example_btn = gr.Button("Try Example Dataset", variant="secondary")

        with gr.Column(scale=3):
            with gr.Tabs():
                with gr.TabItem("Dataset Overview"):
                    overview_md = gr.Markdown("Please upload a file to see the overview.")
                    dataframe_view = gr.Dataframe(interactive=False, label="Data Preview")

                with gr.TabItem("Key Insights"):
                    insights_md = gr.Markdown("Insights will appear here.")

                with gr.TabItem("Visual Story"):
                    charts_plot = gr.Plot(label="Data Visualization")

                with gr.TabItem("Anomalies & Outliers"):
                    anomalies_md = gr.Markdown("Anomaly detection results.")
                    anomalies_df_view = gr.Dataframe(interactive=False, label="Detected Anomalies")

                with gr.TabItem("Next Steps"):
                    questions_md = gr.Markdown("Suggested follow-up questions.")

    # Event wiring: the analysis runs whenever the uploaded file changes.
    analysis_outputs = [overview_md, dataframe_view, insights_md, charts_plot,
                        anomalies_md, anomalies_df_view, questions_md]
    file_upload.change(
        fn=process_file_wrapper,
        inputs=[file_upload],
        outputs=analysis_outputs,
    )

    # Setting a File component's value programmatically does not reliably fire
    # its .change event, so chain the analysis explicitly with .then().
    example_btn.click(
        fn=load_example,
        outputs=[file_upload],
    ).then(
        fn=process_file_wrapper,
        inputs=[file_upload],
        outputs=analysis_outputs,
    )

if __name__ == "__main__":
    demo.launch()
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pandas
|
| 2 |
+
numpy
|
| 3 |
+
scikit-learn
|
| 4 |
+
plotly
|
| 5 |
+
gradio
|
| 6 |
+
huggingface_hub
|
| 7 |
+
openpyxl
|
| 8 |
+
pyarrow
|
src/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Auto Data Analyst
|
| 2 |
+
# Internal modules
|
src/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (160 Bytes). View file
|
|
|
src/__pycache__/anomalies.cpython-313.pyc
ADDED
|
Binary file (1.69 kB). View file
|
|
|
src/__pycache__/cleaning.cpython-313.pyc
ADDED
|
Binary file (3.09 kB). View file
|
|
|
src/__pycache__/ingestion.cpython-313.pyc
ADDED
|
Binary file (1.96 kB). View file
|
|
|
src/__pycache__/llm.cpython-313.pyc
ADDED
|
Binary file (2.21 kB). View file
|
|
|
src/__pycache__/profiling.cpython-313.pyc
ADDED
|
Binary file (4.43 kB). View file
|
|
|
src/__pycache__/visualization.cpython-313.pyc
ADDED
|
Binary file (2.17 kB). View file
|
|
|
src/anomalies.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from sklearn.ensemble import IsolationForest
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
def detect_anomalies(df):
    """Flag unusual rows in the numeric part of *df* using Isolation Forest.

    Returns (anomalies_df, summary_text); the anomalies dataframe keeps all
    original columns for the flagged rows.
    """
    if df is None or df.empty:
        return pd.DataFrame(), "No data for anomaly detection."

    numeric_part = df.select_dtypes(include=[np.number])
    if numeric_part.empty:
        return pd.DataFrame(), "No numerical columns found for anomaly detection."

    # sklearn cannot handle NaNs; impute with per-column medians as a safeguard
    # even though upstream cleaning should already have filled them.
    numeric_part = numeric_part.fillna(numeric_part.median())

    try:
        forest = IsolationForest(contamination=0.05, random_state=42)
        labels = forest.fit_predict(numeric_part)

        # A label of -1 marks an anomaly; the mask aligns positionally with df.
        outliers = df[labels == -1]
        return outliers, f"Detected {len(outliers)} anomalies ({len(outliers)/len(df):.1%} of data)."
    except Exception as e:
        return pd.DataFrame(), f"Anomaly detection failed: {str(e)}"
|
src/cleaning.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
+
def clean_column_names(df):
    """Normalize *df*'s column labels to snake_case, in place.

    Lowercases every label, strips punctuation, and collapses whitespace runs
    into single underscores. Returns the same dataframe for chaining.
    """
    normalized = (
        df.columns.astype(str)
        .str.lower()
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.replace(r'\s+', '_', regex=True)
    )
    df.columns = normalized
    return df
|
| 10 |
+
|
| 11 |
+
def clean_data(df):
    """
    Perform basic deterministic cleaning on a COPY of *df*:
    - Standardizes column names to snake_case
    - Drops fully-empty columns and rows
    - Removes duplicate rows
    - Fills remaining missing values (median for numeric, 'Unknown' otherwise)

    Returns: (cleaned_df, cleaning_log). The input dataframe is NOT mutated
    (the original implementation renamed columns and filled values in place,
    contradicting its own comments about returning a copy).
    """
    if df is None or df.empty:
        return df, []

    log = []

    # Work on a copy so the caller's dataframe is left untouched.
    df = df.copy()

    # 1. Standardize column names (lowercase, strip punctuation, snake_case).
    old_cols = list(df.columns)
    df.columns = (
        df.columns.astype(str)
        .str.lower()
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.replace(r'\s+', '_', regex=True)
    )
    if old_cols != list(df.columns):
        log.append("Standardized column names to snake_case.")

    # 2. Drop columns/rows that are entirely empty.
    initial_shape = df.shape
    df = df.dropna(how='all', axis=1)
    df = df.dropna(how='all', axis=0)
    dropped_cols = initial_shape[1] - df.shape[1]
    dropped_rows = initial_shape[0] - df.shape[0]
    if dropped_cols > 0:
        log.append(f"Dropped {dropped_cols} empty columns.")
    if dropped_rows > 0:
        log.append(f"Dropped {dropped_rows} empty rows.")

    # 3. Remove duplicate rows.
    duplicates = df.duplicated().sum()
    if duplicates > 0:
        df = df.drop_duplicates()
        log.append(f"Removed {duplicates} duplicate rows.")

    # 4. Fill remaining missing values: median for numeric columns, the
    #    sentinel 'Unknown' for everything else. (All-NaN columns were dropped
    #    above, so a numeric median here is always defined.)
    for col in df.columns:
        if df[col].isnull().sum() > 0:
            if pd.api.types.is_numeric_dtype(df[col]):
                fill_val = df[col].median()
                df[col] = df[col].fillna(fill_val)
                log.append(f"Filled missing values in '{col}' with median ({fill_val:.2f}).")
            else:
                df[col] = df[col].fillna("Unknown")
                log.append(f"Filled missing values in '{col}' with 'Unknown'.")

    return df, log
|
src/ingestion.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
def load_file(file_obj):
    """
    Load an uploaded file into a pandas DataFrame.

    Supports CSV, Excel, JSON, and Parquet. Accepts either a plain path
    string or a Gradio file object exposing a ``.name`` attribute — newer
    Gradio versions pass the temp-file path directly as a str, on which the
    previous ``file_obj.name`` access raised AttributeError.

    Returns: (df, None) on success or (None, error_message) on failure.
    """
    if file_obj is None:
        return None, "No file uploaded."

    try:
        # Gradio may hand us either a str path or an object with .name.
        file_path = file_obj if isinstance(file_obj, str) else file_obj.name
        file_ext = os.path.splitext(file_path)[1].lower()

        if file_ext == '.csv':
            df = pd.read_csv(file_path)
        elif file_ext in ['.xlsx', '.xls']:
            df = pd.read_excel(file_path)
        elif file_ext == '.json':
            # Try the common orientations in order of likelihood.
            try:
                df = pd.read_json(file_path, orient='records')
            except ValueError:
                try:
                    df = pd.read_json(file_path, orient='table')
                except ValueError:
                    # Fallback for other json structures if simple enough.
                    df = pd.read_json(file_path)
        elif file_ext == '.parquet':
            df = pd.read_parquet(file_path)
        else:
            return None, f"Unsupported file format: {file_ext}. Please upload CSV, Excel, JSON, or Parquet."

        # Validate tabular structure.
        if df.empty:
            return None, "The uploaded file is empty."

        # NOTE: a single-column file is technically tabular; we allow it.
        return df, None

    except Exception as e:
        return None, f"Error loading file: {str(e)}"
|
src/llm.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from huggingface_hub import InferenceClient
import os

# Initialize the inference client once at import time.
# Relies on HF_TOKEN being set in the environment (as on Spaces); running
# locally without a token may fail or be rate limited.
# timeout=30 prevents requests from hanging indefinitely.
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta", timeout=30)
|
| 9 |
+
|
| 10 |
+
def generate_text(prompt, max_new_tokens=512):
    """Send *prompt* to the hosted chat model and return its reply text.

    Falls back to a static message on any failure so the app stays usable
    when the inference API is down, slow, or rate limited.
    """
    try:
        system_msg = {"role": "system", "content": "You are a senior data analyst. You provide professional, concise, and accurate insights based on data summaries. You do NOT hallucinate numbers."}
        user_msg = {"role": "user", "content": prompt}
        reply = client.chat_completion([system_msg, user_msg], max_tokens=max_new_tokens)
        return reply.choices[0].message.content
    except Exception as e:
        # Graceful degradation instead of crashing the whole pipeline.
        return f"Based on the analysis, please review the charts and data profile. (AI Insights unavailable: {str(e)})"
|
| 21 |
+
|
| 22 |
+
def get_insights(overview_text, anomalies_text):
    """Ask the LLM for 3-5 bullet-point insights grounded in the given summaries.

    Only text summaries are sent — never raw data — and the prompt explicitly
    forbids inventing values not present in them.
    """
    prompt = f"""
Analyze the following dataset summary and anomaly report.
Generate 3-5 key professional insights.
Focus on data quality, distribution patterns, and potential issues.
Do not make up specific values not present in the summary.

Data Summary:
{overview_text}

Anomaly Report:
{anomalies_text}

Output Format:
- Insight 1
- Insight 2
- Insight 3
...
"""
    return generate_text(prompt)
|
| 42 |
+
|
| 43 |
+
def get_followup_questions(overview_text):
    """Ask the LLM to propose 3-5 follow-up questions about the dataset.

    Takes the Markdown overview text and returns a numbered list of questions.
    """
    # Fixed the ungrammatical "to deeper understand" in the original prompt,
    # which degraded the instruction quality.
    prompt = f"""
Based on the following dataset summary, suggest 3-5 relevant follow-up questions
that a data analyst should ask to better understand the business context or data quality.

Data Summary:
{overview_text}

Output Format:
1. Question 1
2. Question 2
3. Question 3
...
"""
    return generate_text(prompt)
|
src/profiling.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
+
def profile_data(df):
    """
    Build a statistical profile of *df*.

    Returns a dict with table-level metrics (row/column counts, missing and
    duplicate statistics), per-column stats under "columns_processing", and
    column names grouped by inferred kind (numerical/categorical/datetime).
    Returns {} for None or empty input.
    """
    if df is None or df.empty:
        return {}

    # Compute table-level aggregates once instead of twice as before.
    total_missing = df.isnull().sum().sum()
    dup_rows = df.duplicated().sum()

    profile = {
        "rows": len(df),
        "columns": len(df.columns),
        "column_names": list(df.columns),
        "missing_cells": total_missing,
        "missing_cells_percent": (total_missing / df.size) * 100,
        "duplicate_rows": dup_rows,
        "duplicate_rows_percent": (dup_rows / len(df)) * 100,
        "columns_processing": {},
        "numerical_columns": [],
        "categorical_columns": [],
        "datetime_columns": [],
    }

    for col in df.columns:
        series = df[col]
        missing = series.isnull().sum()

        col_profile = {
            "type": str(series.dtype),
            "unique": series.nunique(),
            "missing": missing,
            "missing_percent": (missing / len(df)) * 100,
        }

        # Classify the column and compute kind-specific stats.
        if pd.api.types.is_numeric_dtype(series):
            profile["numerical_columns"].append(col)
            col_profile["mean"] = series.mean()
            col_profile["median"] = series.median()
            col_profile["std"] = series.std()
            col_profile["min"] = series.min()
            col_profile["max"] = series.max()
            col_profile["zeros"] = (series == 0).sum()
        elif pd.api.types.is_datetime64_any_dtype(series):
            profile["datetime_columns"].append(col)
            col_profile["min_date"] = series.min()
            col_profile["max_date"] = series.max()
        else:
            profile["categorical_columns"].append(col)
            try:
                col_profile["top_categories"] = series.value_counts().head(5).to_dict()
            except Exception:  # was a bare except; don't swallow KeyboardInterrupt/SystemExit
                col_profile["top_categories"] = {}

        profile["columns_processing"][col] = col_profile

    return profile
|
| 62 |
+
|
| 63 |
+
def get_overview_text(profile):
    """
    Generates a natural language overview from the profile.

    Expects the dict produced by profile_data(); returns a Markdown string.
    Only the first 3 column names per kind are listed, with an ellipsis when
    more exist.
    """
    if not profile:
        return "No data available."

    overview = f"""
### Dataset Overview
- **Rows:** {profile['rows']:,}
- **Columns:** {profile['columns']}
- **Missing Values:** {profile['missing_cells']:,} ({profile['missing_cells_percent']:.2f}%)
- **Duplicates:** {profile['duplicate_rows']:,} ({profile['duplicate_rows_percent']:.2f}%)

#### Column Types
- **Numerical:** {len(profile['numerical_columns'])} ({', '.join(profile['numerical_columns'][:3])}{'...' if len(profile['numerical_columns']) > 3 else ''})
- **Categorical:** {len(profile['categorical_columns'])} ({', '.join(profile['categorical_columns'][:3])}{'...' if len(profile['categorical_columns']) > 3 else ''})
- **Datetime:** {len(profile['datetime_columns'])}
"""
    return overview
|
src/visualization.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import plotly.express as px
|
| 2 |
+
import plotly.graph_objects as go
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import numpy as np
|
| 5 |
+
|
| 6 |
+
def generate_charts(df, profile):
    """
    Build Plotly figures from the profiled dataframe and return one "hero" figure.

    Candidate charts, in priority order: correlation heatmap, numeric
    distributions, categorical counts, scatter of the first two numeric
    columns. Gradio's gr.Plot renders a single figure, so only the first
    candidate is returned; None when nothing can be plotted.
    """
    if df is None or df.empty:
        return None

    figures = []

    # .get() keeps this robust if the profile dict is malformed or missing keys
    # (the original indexed directly and would raise KeyError).
    num_cols = profile.get('numerical_columns', [])
    cat_cols = profile.get('categorical_columns', [])

    # 1. Correlation heatmap (needs at least two numeric columns).
    if len(num_cols) > 1:
        corr = df[num_cols].corr()
        figures.append(px.imshow(corr, text_auto=True, aspect="auto", title="Correlation Matrix"))

    # 2. Distributions for the first few numeric columns.
    for col in num_cols[:3]:
        figures.append(px.histogram(df, x=col, title=f"Distribution of {col}", marginal="box"))

    # 3. Counts for low-cardinality categorical columns.
    for col in cat_cols[:3]:
        if df[col].nunique() < 50:  # skip high-cardinality columns
            counts = df[col].value_counts().head(10)
            figures.append(px.bar(x=counts.index, y=counts.values, labels={'x': col, 'y': 'Count'}, title=f"Count of {col}"))

    # 4. Scatter of the first two numeric columns.
    if len(num_cols) >= 2:
        figures.append(px.scatter(df, x=num_cols[0], y=num_cols[1], title=f"{num_cols[0]} vs {num_cols[1]}"))

    # gr.Plot expects a single figure; the correlation matrix (when present)
    # is the most informative, and it was appended first.
    return figures[0] if figures else None
|
verify_pipeline.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
"""Smoke test: run the full app pipeline end to end (including the live LLM)."""
import pandas as pd
import sys
import os

# Add the project root to sys.path so `app` can be imported regardless of
# the directory this script is launched from. Must happen before the import.
sys.path.append(os.getcwd())

from app import analyze_dataset, load_example

class MockFile:
    # Minimal stand-in for Gradio's upload object: only .name (a path) is read.
    def __init__(self, path):
        self.name = path

print("Generating example dataset...")
example_path = load_example()
print(f"Example dataset created at: {example_path}")

print("Running pipeline...")
mock_file = MockFile(example_path)

try:
    results = analyze_dataset(mock_file)

    # Unpack results to verify types and basic shape of each output slot.
    overview_md, overview_df, insights, chart, anomalies_md, anomalies_df, questions = results

    print("Pipeline finished successfully.")
    print(f"Overview MD Length: {len(overview_md)}")
    print(f"Overview DF Shape: {overview_df.shape if hasattr(overview_df, 'shape') else 'None'}")
    print(f"Insights: {insights[:50]}...")
    print(f"Chart Object: {type(chart)}")
    print(f"Anomalies MD Length: {len(anomalies_md)}")
    print(f"Anomalies DF Shape: {anomalies_df.shape if hasattr(anomalies_df, 'shape') else 'None'}")
    print(f"Questions: {questions[:50]}...")

except Exception as e:
    # Report the failure with a full traceback but still run the cleanup below.
    print(f"Pipeline Failed: {e}")
    import traceback
    traceback.print_exc()

# Cleanup the generated demo file.
if os.path.exists(example_path):
    os.remove(example_path)
|
verify_pipeline_mock.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
"""Smoke test: run the app pipeline with the LLM calls mocked out (offline-safe)."""
import pandas as pd
import sys
import os
from unittest.mock import patch

# Add the project root to sys.path so `app` can be imported regardless of
# the directory this script is launched from. Must happen before the import.
sys.path.append(os.getcwd())

# Mock the LLM module BEFORE importing app: app does `from src.llm import ...`,
# so src.llm's attributes must already be patched when that import runs.
with patch('src.llm.get_insights', return_value="Mocked Insights") as mock_insights, \
     patch('src.llm.get_followup_questions', return_value="Mocked Questions") as mock_questions:

    from app import analyze_dataset, load_example

    class MockFile:
        # Minimal stand-in for Gradio's upload object: only .name (a path) is read.
        def __init__(self, path):
            self.name = path

    print("Generating example dataset...")
    example_path = load_example()
    print(f"Example dataset created at: {example_path}")

    print("Running pipeline with MOCKED LLM...")
    mock_file = MockFile(example_path)

    try:
        results = analyze_dataset(mock_file)

        # Unpack results to verify types and basic shape of each output slot.
        overview_md, overview_df, insights, chart, anomalies_md, anomalies_df, questions = results

        print("Pipeline finished successfully (Mocked LLM).")
        print(f"Overview MD Length: {len(overview_md)}")
        print(f"Overview DF Shape: {overview_df.shape if hasattr(overview_df, 'shape') else 'None'}")
        print(f"Insights: {insights[:50]}...")
        print(f"Chart Object: {type(chart)}")
        print(f"Anomalies MD Length: {len(anomalies_md)}")
        print(f"Anomalies DF Shape: {anomalies_df.shape if hasattr(anomalies_df, 'shape') else 'None'}")
        print(f"Questions: {questions[:50]}...")

    except Exception as e:
        # Report the failure with a full traceback but still run the cleanup below.
        print(f"Pipeline Failed: {e}")
        import traceback
        traceback.print_exc()

    # Cleanup the generated demo file.
    if os.path.exists(example_path):
        os.remove(example_path)
|