salihfurkaan committed
Commit bb9980b · 1 Parent(s): bd3247d

demo files
app.py ADDED
@@ -0,0 +1,132 @@
+ import gradio as gr
+ import pandas as pd
+ from src.ingestion import load_file
+ from src.profiling import profile_data, get_overview_text
+ from src.cleaning import clean_data
+ from src.anomalies import detect_anomalies
+ from src.visualization import generate_charts
+ from src.llm import get_insights, get_followup_questions
+
+ # No global state: the demo is stateless and each upload is processed per request.
+ # (A shared dataframe could be added later if a chat feature needs it.)
+
+ def analyze_dataset(file_obj):
+     if file_obj is None:
+         return (
+             "## Please upload a file to begin.",
+             pd.DataFrame(),
+             "",
+             None,
+             "",
+             pd.DataFrame(),
+             ""
+         )
+
+     # 1. Ingestion
+     df, error = load_file(file_obj)
+     if error:
+         return f"## Error: {error}", pd.DataFrame(), "", None, "", pd.DataFrame(), ""
+
+     # 2. Cleaning & Profiling
+     # Column names are standardized first so later steps can rely on consistent access.
+     df_clean, cleaning_log = clean_data(df)
+     profile = profile_data(df_clean)
+     overview_text = get_overview_text(profile)
+
+     # 3. Anomalies
+     anomalies_df, anomaly_summary = detect_anomalies(df_clean)
+
+     # 4. Visualization
+     chart_figure = generate_charts(df_clean, profile)
+
+     # 5. LLM insights & follow-up questions (the LLM only sees the text summaries)
+     insights = get_insights(overview_text, anomaly_summary)
+     questions = get_followup_questions(overview_text)
+
+     # Format outputs
+     overview_output = f"{overview_text}\n\n**Data Cleaning Log:**\n" + "\n".join([f"- {item}" for item in cleaning_log])
+
+     return (
+         overview_output,                                       # Dataset Overview (Markdown)
+         df_clean.head(),                                       # Data Preview (DataFrame)
+         insights,                                              # Key Insights
+         chart_figure,                                          # Visual Story
+         f"### Anomaly Detection Report\n{anomaly_summary}",    # Anomalies (Markdown)
+         anomalies_df,                                          # Anomalies (DataFrame)
+         questions                                              # Next Steps
+     )
+
+ def load_example():
+     # Create a small CSV for the user to try
+     dummy_data = {
+         "Name": ["Alice", "Bob", "Charlie", "David", "Eve", "Frank", "Grace", "Heidi", "Ivan", "Judy"],
+         "Age": [25, 30, 35, 40, 22, 28, 45, 32, 29, 27],
+         "Salary": [50000, 60000, 75000, 90000, 48000, 52000, 120000, 65000, 58000, 54000],
+         "Department": ["HR", "Engineering", "Engineering", "Management", "HR", "Marketing", "Management", "Engineering", "Marketing", "HR"],
+         "Performance_Score": [3.5, 4.2, 4.8, 3.9, 3.1, 4.0, 4.5, 4.3, 3.8, 4.1]
+     }
+     df = pd.DataFrame(dummy_data)
+     # Plant an outlier for the anomaly detector to find
+     df.loc[6, "Salary"] = 1200000
+
+     df.to_csv("example_dataset.csv", index=False)
+     return "example_dataset.csv"
+
+ def process_file_wrapper(file_obj):
+     # Thin wrapper keeping the Gradio event signature explicit.
+     # Returns: overview_md, overview_df, insights_md, charts_plot, anomalies_md, anomalies_df, questions_md
+     return analyze_dataset(file_obj)
+
+ with gr.Blocks(title="Auto Data Analyst", theme=gr.themes.Soft()) as demo:
+     gr.Markdown("# 📊 Auto Data Analyst — No Questions Needed")
+     gr.Markdown("Upload your structured data (CSV, Excel, JSON, Parquet) and get instant professional insights.")
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             file_upload = gr.File(label="Upload Dataset", file_types=[".csv", ".xlsx", ".json", ".parquet"])
+             example_btn = gr.Button("Try Example Dataset", variant="secondary")
+
+         with gr.Column(scale=3):
+             with gr.Tabs():
+                 with gr.TabItem("Dataset Overview"):
+                     overview_md = gr.Markdown("Please upload a file to see the overview.")
+                     dataframe_view = gr.Dataframe(interactive=False, label="Data Preview")
+
+                 with gr.TabItem("Key Insights"):
+                     insights_md = gr.Markdown("Insights will appear here.")
+
+                 with gr.TabItem("Visual Story"):
+                     charts_plot = gr.Plot(label="Data Visualization")
+
+                 with gr.TabItem("Anomalies & Outliers"):
+                     anomalies_md = gr.Markdown("Anomaly detection results.")
+                     anomalies_df_view = gr.Dataframe(interactive=False, label="Detected Anomalies")
+
+                 with gr.TabItem("Next Steps"):
+                     questions_md = gr.Markdown("Suggested follow-up questions.")
+
+     # Event wiring
+     file_upload.change(
+         fn=process_file_wrapper,
+         inputs=[file_upload],
+         outputs=[overview_md, dataframe_view, insights_md, charts_plot, anomalies_md, anomalies_df_view, questions_md]
+     )
+
+     example_btn.click(
+         fn=load_example,
+         # Returning the file path sets the File component's value; that value change
+         # fires file_upload.change, which runs the analysis on the example file.
+         outputs=[file_upload]
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ pandas
+ numpy
+ scikit-learn
+ plotly
+ gradio
+ huggingface_hub
+ openpyxl
+ pyarrow
src/__init__.py ADDED
@@ -0,0 +1,2 @@
+ # Auto Data Analyst
+ # Internal modules
src/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (160 Bytes).
src/__pycache__/anomalies.cpython-313.pyc ADDED
Binary file (1.69 kB).
src/__pycache__/cleaning.cpython-313.pyc ADDED
Binary file (3.09 kB).
src/__pycache__/ingestion.cpython-313.pyc ADDED
Binary file (1.96 kB).
src/__pycache__/llm.cpython-313.pyc ADDED
Binary file (2.21 kB).
src/__pycache__/profiling.cpython-313.pyc ADDED
Binary file (4.43 kB).
src/__pycache__/visualization.cpython-313.pyc ADDED
Binary file (2.17 kB).
src/anomalies.py ADDED
@@ -0,0 +1,32 @@
+ import pandas as pd
+ import numpy as np
+ from sklearn.ensemble import IsolationForest
+
+ def detect_anomalies(df):
+     """
+     Detects anomalies in numerical data using Isolation Forest.
+     Returns a dataframe of anomalies and a summary string.
+     """
+     if df is None or df.empty:
+         return pd.DataFrame(), "No data for anomaly detection."
+
+     # Select numerical columns
+     numeric_df = df.select_dtypes(include=[np.number])
+
+     if numeric_df.empty:
+         return pd.DataFrame(), "No numerical columns found for anomaly detection."
+
+     # Fill any NaNs that survived cleaning; scikit-learn requires complete data
+     numeric_df = numeric_df.fillna(numeric_df.median())
+
+     try:
+         # Isolation Forest flags roughly the most isolated 5% of rows
+         iso = IsolationForest(contamination=0.05, random_state=42)
+         preds = iso.fit_predict(numeric_df)
+
+         # -1 indicates an anomaly
+         anomalies = df[preds == -1]
+
+         return anomalies, f"Detected {len(anomalies)} anomalies ({len(anomalies)/len(df):.1%} of data)."
+     except Exception as e:
+         return pd.DataFrame(), f"Anomaly detection failed: {str(e)}"
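
For reference, the label convention detect_anomalies relies on can be checked in isolation. A minimal standalone sketch on a made-up one-column frame (the values are illustrative and mirror the salary outlier planted in load_example):

    import pandas as pd
    from sklearn.ensemble import IsolationForest

    # nine ordinary values and one obvious outlier
    toy = pd.DataFrame({"salary": [50, 52, 48, 55, 60, 58, 54, 51, 57, 1200]})

    iso = IsolationForest(contamination=0.1, random_state=0)
    labels = iso.fit_predict(toy)   # array of 1 (inlier) and -1 (outlier)
    print(toy[labels == -1])        # expected to surface the 1200 row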
src/cleaning.py ADDED
@@ -0,0 +1,68 @@
+ import pandas as pd
+
+ def clean_column_names(df):
+     """
+     Standardizes column names to snake_case.
+     """
+     df.columns = df.columns.astype(str).str.lower().str.replace(r'[^\w\s]', '', regex=True).str.replace(r'\s+', '_', regex=True)
+     return df
+
+ def clean_data(df):
+     """
+     Performs basic deterministic cleaning:
+     - Standardizes column names
+     - Drops empty columns and rows
+     - Removes duplicate rows
+     - Fills missing values (median for numerical, 'Unknown' for categorical)
+     Returns: cleaned_df, cleaning_log
+     """
+     if df is None or df.empty:
+         return df, []
+
+     # Work on a copy so the caller's original DataFrame is left untouched
+     df = df.copy()
+     log = []
+
+     # 1. Clean column names
+     old_cols = list(df.columns)
+     df = clean_column_names(df)
+     new_cols = list(df.columns)
+     if old_cols != new_cols:
+         log.append("Standardized column names to snake_case.")
+
+     # 2. Drop empty columns and rows
+     initial_shape = df.shape
+     df = df.dropna(how='all', axis=1)
+     df = df.dropna(how='all', axis=0)
+     final_shape = df.shape
+
+     if initial_shape != final_shape:
+         dropped_cols = initial_shape[1] - final_shape[1]
+         dropped_rows = initial_shape[0] - final_shape[0]
+         if dropped_cols > 0:
+             log.append(f"Dropped {dropped_cols} empty columns.")
+         if dropped_rows > 0:
+             log.append(f"Dropped {dropped_rows} empty rows.")
+
+     # 3. Handle duplicate rows
+     duplicates = df.duplicated().sum()
+     if duplicates > 0:
+         df = df.drop_duplicates()
+         log.append(f"Removed {duplicates} duplicate rows.")
+
+     # 4. Fill missing values with a simple strategy:
+     #    numerical -> median, categorical -> 'Unknown'.
+     #    The cleaned copy is what the rest of the pipeline analyzes.
+     for col in df.columns:
+         if df[col].isnull().sum() > 0:
+             if pd.api.types.is_numeric_dtype(df[col]):
+                 fill_val = df[col].median()
+                 df[col] = df[col].fillna(fill_val)
+                 log.append(f"Filled missing values in '{col}' with median ({fill_val:.2f}).")
+             else:
+                 df[col] = df[col].fillna("Unknown")
+                 log.append(f"Filled missing values in '{col}' with 'Unknown'.")
+
+     return df, log
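
To see what clean_data's log looks like in practice, a minimal usage sketch (the messy column names, duplicate row, and NaN are invented for illustration):

    import numpy as np
    import pandas as pd
    from src.cleaning import clean_data

    messy = pd.DataFrame({
        "Full Name": ["Ann", "Bo", "Bo"],
        "Annual Salary ($)": [50000, np.nan, np.nan],
    })

    cleaned, log = clean_data(messy)
    print(list(cleaned.columns))   # e.g. ['full_name', 'annual_salary_']
    print(log)                     # records the rename, duplicate removal, and median fill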
src/ingestion.py ADDED
@@ -0,0 +1,50 @@
+ import pandas as pd
+ import os
+
+ def load_file(file_obj):
+     """
+     Loads a file into a Pandas DataFrame.
+     Supports CSV, Excel, JSON, and Parquet.
+     Validates that the data is tabular.
+     """
+     if file_obj is None:
+         return None, "No file uploaded."
+
+     try:
+         # What Gradio passes depends on the version and configuration, but
+         # file_obj.name typically holds the path to the uploaded temp file.
+         file_path = file_obj.name
+         file_ext = os.path.splitext(file_path)[1].lower()
+
+         if file_ext == '.csv':
+             df = pd.read_csv(file_path)
+         elif file_ext in ['.xlsx', '.xls']:
+             df = pd.read_excel(file_path)
+         elif file_ext == '.json':
+             # Try simple records first, then the table schema, then let pandas infer
+             try:
+                 df = pd.read_json(file_path, orient='records')
+             except ValueError:
+                 try:
+                     df = pd.read_json(file_path, orient='table')
+                 except ValueError:
+                     df = pd.read_json(file_path)
+         elif file_ext == '.parquet':
+             df = pd.read_parquet(file_path)
+         else:
+             return None, f"Unsupported file format: {file_ext}. Please upload CSV, Excel, JSON, or Parquet."
+
+         # Validate tabular structure
+         if df.empty:
+             return None, "The uploaded file is empty."
+
+         # A single column is technically tabular but rarely useful for this tool;
+         # it is allowed here rather than rejected.
+
+         return df, None
+
+     except Exception as e:
+         return None, f"Error loading file: {str(e)}"
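
The (DataFrame, error) contract above is what app.py branches on; a quick sketch of both outcomes (the file names are hypothetical, and the stand-in object mimics Gradio's upload):

    from src.ingestion import load_file

    class FakeUpload:
        def __init__(self, path):
            self.name = path   # Gradio exposes the temp path via .name

    df, err = load_file(FakeUpload("example_dataset.csv"))   # err is None on success
    df, err = load_file(FakeUpload("notes.txt"))             # df is None, err names the unsupported format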
src/llm.py ADDED
@@ -0,0 +1,57 @@
+ from huggingface_hub import InferenceClient
+
+ # Initialize the client. It relies on HF_TOKEN being set in the environment (as on Spaces);
+ # running locally without a token may fail or be rate limited.
+ # A timeout keeps requests from hanging indefinitely.
+ client = InferenceClient("HuggingFaceH4/zephyr-7b-beta", timeout=30)
+
+ def generate_text(prompt, max_new_tokens=512):
+     try:
+         messages = [
+             {"role": "system", "content": "You are a senior data analyst. You provide professional, concise, and accurate insights based on data summaries. You do NOT hallucinate numbers."},
+             {"role": "user", "content": prompt}
+         ]
+         response = client.chat_completion(messages, max_tokens=max_new_tokens)
+         return response.choices[0].message.content
+     except Exception as e:
+         # Graceful fallback if the LLM is unavailable or times out
+         return f"Based on the analysis, please review the charts and data profile. (AI Insights unavailable: {str(e)})"
+
+ def get_insights(overview_text, anomalies_text):
+     prompt = f"""
+ Analyze the following dataset summary and anomaly report.
+ Generate 3-5 key professional insights.
+ Focus on data quality, distribution patterns, and potential issues.
+ Do not make up specific values not present in the summary.
+
+ Data Summary:
+ {overview_text}
+
+ Anomaly Report:
+ {anomalies_text}
+
+ Output Format:
+ - Insight 1
+ - Insight 2
+ - Insight 3
+ ...
+ """
+     return generate_text(prompt)
+
+ def get_followup_questions(overview_text):
+     prompt = f"""
+ Based on the following dataset summary, suggest 3-5 relevant follow-up questions
+ that a data analyst should ask to better understand the business context or data quality.
+
+ Data Summary:
+ {overview_text}
+
+ Output Format:
+ 1. Question 1
+ 2. Question 2
+ 3. Question 3
+ ...
+ """
+     return generate_text(prompt)
src/profiling.py ADDED
@@ -0,0 +1,82 @@
+ import pandas as pd
+ import numpy as np
+
+ def profile_data(df):
+     """
+     Generates a statistical profile of the DataFrame.
+     Returns a dictionary containing key metrics.
+     """
+     if df is None or df.empty:
+         return {}
+
+     profile = {
+         "rows": len(df),
+         "columns": len(df.columns),
+         "column_names": list(df.columns),
+         "missing_cells": df.isnull().sum().sum(),
+         "missing_cells_percent": (df.isnull().sum().sum() / df.size) * 100,
+         "duplicate_rows": df.duplicated().sum(),
+         "duplicate_rows_percent": (df.duplicated().sum() / len(df)) * 100,
+         "columns_processing": {},
+         "numerical_columns": [],
+         "categorical_columns": [],
+         "datetime_columns": []
+     }
+
+     for col in df.columns:
+         col_type = str(df[col].dtype)
+         n_unique = df[col].nunique()
+         missing = df[col].isnull().sum()
+
+         col_profile = {
+             "type": col_type,
+             "unique": n_unique,
+             "missing": missing,
+             "missing_percent": (missing / len(df)) * 100
+         }
+
+         # Classify the column and compute type-specific stats
+         if pd.api.types.is_numeric_dtype(df[col]):
+             profile["numerical_columns"].append(col)
+             col_profile["mean"] = df[col].mean()
+             col_profile["median"] = df[col].median()
+             col_profile["std"] = df[col].std()
+             col_profile["min"] = df[col].min()
+             col_profile["max"] = df[col].max()
+             col_profile["zeros"] = (df[col] == 0).sum()
+         elif pd.api.types.is_datetime64_any_dtype(df[col]):
+             profile["datetime_columns"].append(col)
+             col_profile["min_date"] = df[col].min()
+             col_profile["max_date"] = df[col].max()
+         else:
+             profile["categorical_columns"].append(col)
+             # Top categories
+             try:
+                 col_profile["top_categories"] = df[col].value_counts().head(5).to_dict()
+             except Exception:
+                 col_profile["top_categories"] = {}
+
+         profile["columns_processing"][col] = col_profile
+
+     return profile
+
+ def get_overview_text(profile):
+     """
+     Generates a natural-language overview (Markdown) from the profile.
+     """
+     if not profile:
+         return "No data available."
+
+     # Keep the Markdown flush-left so indented lines are not rendered as code blocks
+     overview = f"""### Dataset Overview
+ - **Rows:** {profile['rows']:,}
+ - **Columns:** {profile['columns']}
+ - **Missing Values:** {profile['missing_cells']:,} ({profile['missing_cells_percent']:.2f}%)
+ - **Duplicates:** {profile['duplicate_rows']:,} ({profile['duplicate_rows_percent']:.2f}%)
+
+ #### Column Types
+ - **Numerical:** {len(profile['numerical_columns'])} ({', '.join(profile['numerical_columns'][:3])}{'...' if len(profile['numerical_columns']) > 3 else ''})
+ - **Categorical:** {len(profile['categorical_columns'])} ({', '.join(profile['categorical_columns'][:3])}{'...' if len(profile['categorical_columns']) > 3 else ''})
+ - **Datetime:** {len(profile['datetime_columns'])}
+ """
+     return overview
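
As a quick check of what the profile dictionary holds, a minimal sketch on a throwaway frame (the column names and values are invented):

    import pandas as pd
    from src.profiling import profile_data, get_overview_text

    df = pd.DataFrame({
        "age": [25, 30, None, 40],
        "department": ["HR", "Engineering", "HR", "Marketing"],
    })

    profile = profile_data(df)
    print(profile["numerical_columns"])                      # ['age']
    print(profile["categorical_columns"])                    # ['department']
    print(profile["columns_processing"]["age"]["missing"])   # 1
    print(get_overview_text(profile))                        # the Markdown shown in the Dataset Overview tab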
src/visualization.py ADDED
@@ -0,0 +1,56 @@
+ import plotly.express as px
+
+ def generate_charts(df, profile):
+     """
+     Generates a set of Plotly charts based on the data profile.
+     Gradio's Plot component expects a single figure, so only the first
+     (most informative) figure built is returned.
+     """
+     if df is None or df.empty:
+         return None
+
+     figures = []
+
+     # 1. Correlation heatmap (numerical)
+     num_cols = profile['numerical_columns']
+     if len(num_cols) > 1:
+         corr = df[num_cols].corr()
+         fig_corr = px.imshow(corr, text_auto=True, aspect="auto", title="Correlation Matrix")
+         figures.append(fig_corr)
+
+     # 2. Distributions for the first few numerical columns
+     for col in num_cols[:3]:
+         fig_hist = px.histogram(df, x=col, title=f"Distribution of {col}", marginal="box")
+         figures.append(fig_hist)
+
+     # 3. Category counts for the first few low-cardinality categorical columns
+     cat_cols = profile['categorical_columns']
+     for col in cat_cols[:3]:
+         if df[col].nunique() < 50:  # skip high-cardinality columns
+             counts = df[col].value_counts().head(10)
+             fig_bar = px.bar(x=counts.index, y=counts.values, labels={'x': col, 'y': 'Count'}, title=f"Count of {col}")
+             figures.append(fig_bar)
+
+     # 4. Scatter plot of the first two numerical columns
+     if len(num_cols) >= 2:
+         fig_scat = px.scatter(df, x=num_cols[0], y=num_cols[1], title=f"{num_cols[0]} vs {num_cols[1]}")
+         figures.append(fig_scat)
+
+     # Return the correlation matrix as the "Visual Story" hero chart when available,
+     # otherwise whichever figure was built first.
+     if figures:
+         return figures[0]
+
+     return None
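
generate_charts builds several figures but returns only one, because gr.Plot holds a single figure. If every chart should appear in that one slot instead, one possible alternative (a sketch, assuming simple trace-based figures such as histograms, bars, and scatters) is to copy each figure's traces into a make_subplots grid:

    import math
    from plotly.subplots import make_subplots

    def combine_figures(figures, cols=2):
        """Copy the traces of several simple Plotly figures into one subplot grid."""
        rows = math.ceil(len(figures) / cols)
        combined = make_subplots(rows=rows, cols=cols,
                                 subplot_titles=[f.layout.title.text for f in figures])
        for i, fig in enumerate(figures):
            r, c = i // cols + 1, i % cols + 1
            for trace in fig.data:
                combined.add_trace(trace, row=r, col=c)
        combined.update_layout(showlegend=False, height=400 * rows)
        return combined

    # e.g. return combine_figures(figures) instead of figures[0]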
verify_pipeline.py ADDED
@@ -0,0 +1,44 @@
+ import sys
+ import os
+
+ # Add the project root to sys.path
+ sys.path.append(os.getcwd())
+
+ from app import analyze_dataset, load_example
+
+ class MockFile:
+     """Stands in for Gradio's uploaded-file object, which exposes the temp path via .name."""
+     def __init__(self, path):
+         self.name = path
+
+ print("Generating example dataset...")
+ example_path = load_example()
+ print(f"Example dataset created at: {example_path}")
+
+ print("Running pipeline...")
+ mock_file = MockFile(example_path)
+
+ try:
+     results = analyze_dataset(mock_file)
+
+     # Unpack results to verify types
+     overview_md, overview_df, insights, chart, anomalies_md, anomalies_df, questions = results
+
+     print("Pipeline finished successfully.")
+     print(f"Overview MD Length: {len(overview_md)}")
+     print(f"Overview DF Shape: {overview_df.shape if hasattr(overview_df, 'shape') else 'None'}")
+     print(f"Insights: {insights[:50]}...")
+     print(f"Chart Object: {type(chart)}")
+     print(f"Anomalies MD Length: {len(anomalies_md)}")
+     print(f"Anomalies DF Shape: {anomalies_df.shape if hasattr(anomalies_df, 'shape') else 'None'}")
+     print(f"Questions: {questions[:50]}...")
+
+ except Exception as e:
+     print(f"Pipeline Failed: {e}")
+     import traceback
+     traceback.print_exc()
+
+ # Cleanup
+ if os.path.exists(example_path):
+     os.remove(example_path)
verify_pipeline_mock.py ADDED
@@ -0,0 +1,49 @@
+ import sys
+ import os
+ from unittest.mock import patch
+
+ # Add the project root to sys.path
+ sys.path.append(os.getcwd())
+
+ # Patch the LLM helpers BEFORE importing app, so app binds to the mocks
+ with patch('src.llm.get_insights', return_value="Mocked Insights"), \
+      patch('src.llm.get_followup_questions', return_value="Mocked Questions"):
+
+     from app import analyze_dataset, load_example
+
+     class MockFile:
+         """Stands in for Gradio's uploaded-file object, which exposes the temp path via .name."""
+         def __init__(self, path):
+             self.name = path
+
+     print("Generating example dataset...")
+     example_path = load_example()
+     print(f"Example dataset created at: {example_path}")
+
+     print("Running pipeline with MOCKED LLM...")
+     mock_file = MockFile(example_path)
+
+     try:
+         results = analyze_dataset(mock_file)
+
+         # Unpack results to verify types
+         overview_md, overview_df, insights, chart, anomalies_md, anomalies_df, questions = results
+
+         print("Pipeline finished successfully (Mocked LLM).")
+         print(f"Overview MD Length: {len(overview_md)}")
+         print(f"Overview DF Shape: {overview_df.shape if hasattr(overview_df, 'shape') else 'None'}")
+         print(f"Insights: {insights[:50]}...")
+         print(f"Chart Object: {type(chart)}")
+         print(f"Anomalies MD Length: {len(anomalies_md)}")
+         print(f"Anomalies DF Shape: {anomalies_df.shape if hasattr(anomalies_df, 'shape') else 'None'}")
+         print(f"Questions: {questions[:50]}...")
+
+     except Exception as e:
+         print(f"Pipeline Failed: {e}")
+         import traceback
+         traceback.print_exc()
+
+     # Cleanup
+     if os.path.exists(example_path):
+         os.remove(example_path)