Update app.py
Browse files
app.py
CHANGED
|
@@ -31,8 +31,10 @@ login(token=hf_token)
|
|
| 31 |
# SmolAgent initialization
|
| 32 |
model = HfApiModel("mistralai/Mixtral-8x7B-Instruct-v0.1", token=hf_token)
|
| 33 |
|
|
|
|
| 34 |
df_global = None
|
| 35 |
target_column_global = None
|
|
|
|
| 36 |
|
| 37 |
def clean_data(df):
|
| 38 |
df = df.dropna(how='all', axis=1).dropna(how='all', axis=0)
|
|
@@ -42,24 +44,44 @@ def clean_data(df):
|
|
| 42 |
df = df.fillna(df.mean(numeric_only=True))
|
| 43 |
return df
|
| 44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
def upload_file(file):
    """Load an uploaded CSV/Excel file, clean it, and cache it in ``df_global``.

    Parameters
    ----------
    file : gradio file object or None
        The uploaded file; ``file.name`` is its path on disk.

    Returns
    -------
    tuple
        (preview DataFrame — an error frame when nothing was uploaded,
        ``gr.update`` carrying the column names for the target dropdown).
    """
    global df_global
    if file is None:
        return pd.DataFrame({"Error": ["No file uploaded."]}), gr.update(choices=[])

    # Lower-case the extension so ".CSV"/".Csv" files are parsed as CSV
    # instead of being mistakenly routed to the Excel reader (bug fix).
    ext = os.path.splitext(file.name)[-1].lower()
    df = pd.read_csv(file.name) if ext == ".csv" else pd.read_excel(file.name)
    df = clean_data(df)
    df_global = df
    return df.head(), gr.update(choices=df.columns.tolist())
|
| 54 |
|
| 55 |
-
|
| 56 |
-
|
| 57 |
def set_target_column(col_name):
    """Remember *col_name* as the globally selected target column.

    Returns a human-readable confirmation string for the UI.
    """
    global target_column_global
    target_column_global = col_name
    confirmation = f"✅ Target column set to: {col_name}"
    return confirmation
|
| 61 |
|
| 62 |
|
|
|
|
| 63 |
def format_analysis_report(raw_output, visuals):
|
| 64 |
import json
|
| 65 |
|
|
@@ -195,7 +217,7 @@ def extract_json_from_codeagent_output(raw_output):
|
|
| 195 |
|
| 196 |
|
| 197 |
|
| 198 |
-
def analyze_data(csv_file, additional_notes=""):
|
| 199 |
try:
|
| 200 |
start_time = time.time()
|
| 201 |
process = psutil.Process(os.getpid())
|
|
@@ -211,7 +233,7 @@ def analyze_data(csv_file, additional_notes=""):
|
|
| 211 |
run = wandb.init(project="huggingface-data-analysis", config={
|
| 212 |
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
| 213 |
"additional_notes": additional_notes,
|
| 214 |
-
"source_file": csv_file.name if csv_file else
|
| 215 |
})
|
| 216 |
|
| 217 |
# Initialize Code Agent
|
|
@@ -221,8 +243,23 @@ def analyze_data(csv_file, additional_notes=""):
|
|
| 221 |
additional_authorized_imports=["numpy", "pandas", "matplotlib.pyplot", "seaborn", "sklearn", "json"]
|
| 222 |
)
|
| 223 |
|
| 224 |
-
#
|
| 225 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
You are a helpful data analysis agent. Please follow these very strict instructions and formatting:
|
| 227 |
|
| 228 |
1. Load the data from the provided `source_file`.
|
|
@@ -259,10 +296,11 @@ Be concise and avoid any narrative outside this final dictionary.
|
|
| 259 |
Never use unauthorized imports (only pandas, numpy, matplotlib, seaborn are allowed)
|
| 260 |
"""
|
| 261 |
|
| 262 |
-
# Run
|
| 263 |
-
analysis_result = agent.run(
|
| 264 |
"additional_notes": additional_notes,
|
| 265 |
-
"source_file": csv_file.name if csv_file else None
|
|
|
|
| 266 |
})
|
| 267 |
|
| 268 |
# Performance metrics
|
|
@@ -281,25 +319,19 @@ Never use unauthorized imports (only pandas, numpy, matplotlib, seaborn are allo
|
|
| 281 |
if f.endswith(('.png', '.jpg', '.jpeg'))
|
| 282 |
])
|
| 283 |
|
| 284 |
-
# Log visuals to WandB
|
| 285 |
for viz in visuals:
|
| 286 |
wandb.log({os.path.basename(viz): wandb.Image(viz)})
|
| 287 |
|
| 288 |
run.finish()
|
| 289 |
-
|
| 290 |
print("DEBUG - Raw agent output:", analysis_result[:500] + "...")
|
| 291 |
-
print("Columns in data:", df_global.columns.tolist())
|
| 292 |
-
print("Data types:", df_global.dtypes)
|
| 293 |
with open("agent_output.txt", "w") as f:
|
| 294 |
f.write(str(analysis_result))
|
| 295 |
-
# Parse the agent output
|
| 296 |
-
parsed_result = extract_json_from_codeagent_output(analysis_result)
|
| 297 |
-
print(f"DEBUG - Parsed result: {parsed_result}") # Debug output
|
| 298 |
|
|
|
|
| 299 |
if parsed_result:
|
| 300 |
return format_analysis_report(parsed_result, visuals)
|
| 301 |
else:
|
| 302 |
-
# Fallback to showing raw output if parsing fails
|
| 303 |
error_msg = f"Failed to parse agent output. Showing raw response:\n{str(analysis_result)[:2000]}"
|
| 304 |
print(error_msg)
|
| 305 |
return f"<pre>{error_msg}</pre>", visuals
|
|
@@ -309,7 +341,7 @@ Never use unauthorized imports (only pandas, numpy, matplotlib, seaborn are allo
|
|
| 309 |
print(error_msg)
|
| 310 |
return f"<pre>{error_msg}</pre>", []
|
| 311 |
|
| 312 |
-
|
| 313 |
def compare_models():
|
| 314 |
import seaborn as sns
|
| 315 |
from sklearn.model_selection import cross_val_predict
|
|
|
|
| 31 |
# SmolAgent initialization
|
| 32 |
model = HfApiModel("mistralai/Mixtral-8x7B-Instruct-v0.1", token=hf_token)
|
| 33 |
|
| 34 |
+
# Globals
|
| 35 |
df_global = None
|
| 36 |
target_column_global = None
|
| 37 |
+
data_summary_global = None # ⬅️ Added for summarized data
|
| 38 |
|
| 39 |
def clean_data(df):
|
| 40 |
df = df.dropna(how='all', axis=1).dropna(how='all', axis=0)
|
|
|
|
| 44 |
df = df.fillna(df.mean(numeric_only=True))
|
| 45 |
return df
|
| 46 |
|
| 47 |
+
def summarize_data(df: pd.DataFrame, max_cols: int = 10, max_rows: int = 5) -> str:
    """Build a compact, human-readable text summary of *df*.

    The summary lists the dataset shape, column dtypes, per-column missing
    value counts, the describe() statistics for the first ``max_rows``
    numeric columns (``describe().T`` puts one column per row), and the top
    ``max_rows`` value counts for up to ``max_cols`` non-numeric columns.

    Returns the summary as a single newline-joined string.
    """
    parts = [f"Dataset shape: {df.shape}", "\nColumn types:\n" + str(df.dtypes)]

    numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
    non_numeric_columns = df.select_dtypes(exclude=[np.number]).columns.tolist()

    parts.append("\nMissing values:\n" + str(df.isnull().sum()))

    if numeric_columns:
        described = df[numeric_columns].describe().T.head(max_rows)
        parts.append("\nNumerical summary:\n" + str(described))
    if non_numeric_columns:
        parts.append("\nCategorical value counts (top categories):")
        for column in non_numeric_columns[:max_cols]:
            top_values = df[column].value_counts().head(max_rows)
            parts.append(f"\nColumn: {column}\n{top_values}")

    return "\n".join(parts)
|
| 65 |
+
|
| 66 |
def upload_file(file):
    """Load an uploaded CSV/Excel file, clean it, and cache data globally.

    Stores the cleaned frame in ``df_global`` and a text summary (via
    ``summarize_data``) in ``data_summary_global`` for later prompt building.

    Parameters
    ----------
    file : gradio file object or None
        The uploaded file; ``file.name`` is its path on disk.

    Returns
    -------
    tuple
        (preview DataFrame — an error frame when nothing was uploaded,
        ``gr.update`` carrying the column names for the target dropdown).
    """
    global df_global, data_summary_global
    if file is None:
        return pd.DataFrame({"Error": ["No file uploaded."]}), gr.update(choices=[])

    # Lower-case the extension so ".CSV"/".Csv" files are parsed as CSV
    # instead of being mistakenly routed to the Excel reader (bug fix).
    ext = os.path.splitext(file.name)[-1].lower()
    df = pd.read_csv(file.name) if ext == ".csv" else pd.read_excel(file.name)
    df = clean_data(df)
    df_global = df
    data_summary_global = summarize_data(df)  # cache summary for the agent prompt
    return df.head(), gr.update(choices=df.columns.tolist())
|
| 77 |
|
|
|
|
|
|
|
| 78 |
def set_target_column(col_name):
    """Store the chosen target column in the module-level global.

    Returns a confirmation message suitable for display in the UI.
    """
    global target_column_global
    target_column_global = col_name
    return "✅ Target column set to: {}".format(col_name)
|
| 82 |
|
| 83 |
|
| 84 |
+
|
| 85 |
def format_analysis_report(raw_output, visuals):
|
| 86 |
import json
|
| 87 |
|
|
|
|
| 217 |
|
| 218 |
|
| 219 |
|
| 220 |
+
def analyze_data(csv_file=None, additional_notes="", use_summary=True):
|
| 221 |
try:
|
| 222 |
start_time = time.time()
|
| 223 |
process = psutil.Process(os.getpid())
|
|
|
|
| 233 |
run = wandb.init(project="huggingface-data-analysis", config={
|
| 234 |
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
|
| 235 |
"additional_notes": additional_notes,
|
| 236 |
+
"source_file": csv_file.name if csv_file else "summarized_input"
|
| 237 |
})
|
| 238 |
|
| 239 |
# Initialize Code Agent
|
|
|
|
| 243 |
additional_authorized_imports=["numpy", "pandas", "matplotlib.pyplot", "seaborn", "sklearn", "json"]
|
| 244 |
)
|
| 245 |
|
| 246 |
+
# Choose prompt content
|
| 247 |
+
if use_summary and data_summary_global:
|
| 248 |
+
input_data = data_summary_global
|
| 249 |
+
data_instruction = """
|
| 250 |
+
You are analyzing summarized dataset information from a CSV file. Your job is to:
|
| 251 |
+
|
| 252 |
+
1. Interpret the summary content as if it was produced from a real dataset.
|
| 253 |
+
2. Derive at least 5 high-level insights based on column types, distributions, missing values, etc.
|
| 254 |
+
3. Imagine or mock visualizations and describe what they would show. Use synthetic data simulation with numpy/pandas if needed.
|
| 255 |
+
4. Save plots to './figures/' using matplotlib or seaborn.
|
| 256 |
+
|
| 257 |
+
Always respond in the structured dictionary format below.
|
| 258 |
+
"""
|
| 259 |
+
else:
|
| 260 |
+
# Fall back to full file input
|
| 261 |
+
input_data = None # You load file within the agent
|
| 262 |
+
data_instruction = """
|
| 263 |
You are a helpful data analysis agent. Please follow these very strict instructions and formatting:
|
| 264 |
|
| 265 |
1. Load the data from the provided `source_file`.
|
|
|
|
| 296 |
Never use unauthorized imports (only pandas, numpy, matplotlib, seaborn are allowed)
|
| 297 |
"""
|
| 298 |
|
| 299 |
+
# Run agent with either summarized content or CSV
|
| 300 |
+
analysis_result = agent.run(data_instruction, additional_args={
|
| 301 |
"additional_notes": additional_notes,
|
| 302 |
+
"source_file": csv_file.name if csv_file and not use_summary else None,
|
| 303 |
+
"data_summary": input_data if use_summary else None
|
| 304 |
})
|
| 305 |
|
| 306 |
# Performance metrics
|
|
|
|
| 319 |
if f.endswith(('.png', '.jpg', '.jpeg'))
|
| 320 |
])
|
| 321 |
|
|
|
|
| 322 |
for viz in visuals:
|
| 323 |
wandb.log({os.path.basename(viz): wandb.Image(viz)})
|
| 324 |
|
| 325 |
run.finish()
|
| 326 |
+
|
| 327 |
print("DEBUG - Raw agent output:", analysis_result[:500] + "...")
|
|
|
|
|
|
|
| 328 |
with open("agent_output.txt", "w") as f:
|
| 329 |
f.write(str(analysis_result))
|
|
|
|
|
|
|
|
|
|
| 330 |
|
| 331 |
+
parsed_result = extract_json_from_codeagent_output(analysis_result)
|
| 332 |
if parsed_result:
|
| 333 |
return format_analysis_report(parsed_result, visuals)
|
| 334 |
else:
|
|
|
|
| 335 |
error_msg = f"Failed to parse agent output. Showing raw response:\n{str(analysis_result)[:2000]}"
|
| 336 |
print(error_msg)
|
| 337 |
return f"<pre>{error_msg}</pre>", visuals
|
|
|
|
| 341 |
print(error_msg)
|
| 342 |
return f"<pre>{error_msg}</pre>", []
|
| 343 |
|
| 344 |
+
|
| 345 |
def compare_models():
|
| 346 |
import seaborn as sns
|
| 347 |
from sklearn.model_selection import cross_val_predict
|