AIDataAgentProjectFinal

Paused

App Files Files Community

pavanmutha committed on Apr 18, 2025

Commit

fcdbea4

verified ·

1 Parent(s): 89c639a

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -24

app.py CHANGED Viewed

@@ -58,30 +58,40 @@ def set_target_column(col_name):
     return f"✅ Target column set to: {col_name}"
 def clean_data(df):
-    # Drop rows and columns where all values are NaN
     df = df.dropna(how='all', axis=1).dropna(how='all', axis=0)
-    # Convert columns with object (string) data type to strings
-    for col in df.select_dtypes(include='object').columns:
-        df[col] = df[col].astype(str)
-    # Ensure that the 'Amount' column is treated as a string before using str accessor
-    if 'Amount' in df.columns:
-        df['Amount'] = df['Amount'].astype(str).str.replace(',', '').str.replace('$', '').str.strip()
-    # Convert categorical columns to numeric using LabelEncoder
     for col in df.select_dtypes(include='object').columns:
-        if col != 'Amount':  # Skip 'Amount' column as it was already cleaned
             df[col] = LabelEncoder().fit_transform(df[col])
-    # Fill missing values in numeric columns with the mean of each column
     df = df.fillna(df.mean(numeric_only=True))
     return df
 # Extract JSON from the CodeAgent output when it is not in the expected format
 def extract_json_from_codeagent_output(raw_output):
@@ -114,11 +124,25 @@ def extract_json_from_codeagent_output(raw_output):
     # Return an error if JSON extraction fails
     return {"error": "Failed to extract structured JSON"}
 def analyze_data(csv_file, additional_notes=""):
     start_time = time.time()
     process = psutil.Process(os.getpid())
     initial_memory = process.memory_info().rss / 1024 ** 2
     # Clear or create figures folder
     if os.path.exists('./figures'):
         shutil.rmtree('./figures')
@@ -129,16 +153,17 @@ def analyze_data(csv_file, additional_notes=""):
     run = wandb.init(project="huggingface-data-analysis", config={
         "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
         "additional_notes": additional_notes,
-        "source_file": csv_file.name if csv_file else None
     })
     agent = CodeAgent(
         tools=[],
         model=model,
         additional_authorized_imports=["numpy", "pandas", "matplotlib.pyplot", "seaborn", "sklearn", "json"]
     )
-    # Run the CodeAgent
     raw_output = agent.run("""
         You are a data analysis agent. Follow these instructions EXACTLY:
         1. Load the data from the given `source_file` ONLY. DO NOT create your OWN DATA.
@@ -146,7 +171,7 @@ def analyze_data(csv_file, additional_notes=""):
         3. Save all figures to `./figures` as PNG using matplotlib or seaborn.
         4. Use only authorized imports: `pandas`, `numpy`, `matplotlib.pyplot`, `seaborn`, `json`.
         5. DO NOT return any explanations, thoughts, or narration outside the final JSON block
-        6. Run only 5 iteration and return output quickly.
         7. Output ONLY the following JSON code block format, exactly:
         {
             'observations': {
@@ -158,14 +183,14 @@ def analyze_data(csv_file, additional_notes=""):
                 ...
             }
         }
-    """, additional_args={"additional_notes": additional_notes, "source_file": csv_file})
-    # Parse agent output
     parsed_result = extract_json_from_codeagent_output(raw_output) or {
         "error": "Failed to extract structured JSON"
     }
-    # Record execution time and memory usage
     execution_time = time.time() - start_time
     final_memory = process.memory_info().rss / 1024 ** 2
     memory_usage = final_memory - initial_memory
@@ -175,14 +200,14 @@ def analyze_data(csv_file, additional_notes=""):
         "memory_usage_mb": round(memory_usage, 2)
     })
-    # Collect generated visualizations
     visuals = [os.path.join('./figures', f) for f in os.listdir('./figures') if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
     for viz in visuals:
         wandb.log({os.path.basename(viz): wandb.Image(viz)})
     run.finish()
-    # Generate summary HTML
     summary_html = "<h3>📊 Data Analysis Summary</h3>"
     if "observations" in parsed_result:
         summary_html += "<h4>🔍 Observations</h4><ul>" + "".join(
@@ -195,11 +220,11 @@ def analyze_data(csv_file, additional_notes=""):
     if "error" in parsed_result:
         summary_html += f"<p style='color:red'><b>Error:</b> {parsed_result['error']}</p>"
-    # Return summary HTML and visual paths for gr.HTML + gr.Gallery
     return summary_html, visuals
 def format_analysis_report(raw_output, visuals):
     import json

     return f"✅ Target column set to: {col_name}"
 def clean_data(df):
     """Return a cleaned version of ``df`` with text columns made numeric.

     Steps, in order:

     1. Drop columns, then rows, whose values are all missing.
     2. For every object-dtype column, strip ``$``, ``,`` and surrounding
        whitespace from each value and convert the column to numbers, but
        only when every non-blank value parses. Otherwise the column is
        left exactly as it was.
     3. Label-encode the object-dtype columns that remain.
     4. Fill remaining NaNs in numeric columns with the column mean.

     Args:
         df: Input ``pandas.DataFrame``.

     Returns:
         A new ``pandas.DataFrame``; the dropna calls produce a new frame
         before any column is assigned.

     Raises:
         Exceptions from pandas or scikit-learn propagate to the caller
         instead of being silently discarded.
     """
     import pandas as pd
     # Drop completely empty columns first, then completely empty rows.
     df = df.dropna(how='all', axis=1).dropna(how='all', axis=0)
     # Sanitize money/number-looking text columns.
     for col in df.select_dtypes(include='object').columns:
         # astype(str) keeps non-string cells (e.g. ints in a mixed column)
         # instead of turning them into NaN the way the .str accessor does.
         text = df[col].astype(str).str.replace(r'[$,]', '', regex=True).str.strip()
         blank = df[col].isna() | (text == '')
         # errors='ignore' is deprecated in pandas; coerce and then accept
         # the result only if no real value was lost in the conversion.
         numeric = pd.to_numeric(text.mask(blank), errors='coerce')
         if not blank.all() and numeric[~blank].notna().all():
             df[col] = numeric
     # Encode any remaining object-type columns as integer labels.
     text_cols = df.select_dtypes(include='object').columns
     if len(text_cols) > 0:
         # Imported lazily so purely numeric inputs do not need scikit-learn.
         from sklearn.preprocessing import LabelEncoder
         for col in text_cols:
             df[col] = LabelEncoder().fit_transform(df[col].astype(str))
     # Fill remaining NaNs in numeric columns with each column's mean.
     df = df.fillna(df.mean(numeric_only=True))
     return df
 # Extract JSON from the CodeAgent output when it is not in the expected format
 def extract_json_from_codeagent_output(raw_output):
     # Return an error if JSON extraction fails
     return {"error": "Failed to extract structured JSON"}
+import pandas as pd
+import tempfile
 def analyze_data(csv_file, additional_notes=""):
     start_time = time.time()
     process = psutil.Process(os.getpid())
     initial_memory = process.memory_info().rss / 1024 ** 2
+    # Load and clean the data BEFORE passing to the agent
+    try:
+        df = pd.read_csv(csv_file)
+        df = clean_data(df)
+    except Exception as e:
+        return f"<p style='color:red'><b>Error loading or cleaning CSV:</b> {e}</p>", []
+    # Save cleaned data to a temporary file
+    tmp_cleaned = tempfile.NamedTemporaryFile(delete=False, suffix=".csv", mode='w')
+    df.to_csv(tmp_cleaned.name, index=False)
     # Clear or create figures folder
     if os.path.exists('./figures'):
         shutil.rmtree('./figures')
     run = wandb.init(project="huggingface-data-analysis", config={
         "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
         "additional_notes": additional_notes,
+        "source_file": tmp_cleaned.name
     })
+    # Initialize agent
     agent = CodeAgent(
         tools=[],
         model=model,
         additional_authorized_imports=["numpy", "pandas", "matplotlib.pyplot", "seaborn", "sklearn", "json"]
     )
+    # Run the agent on the cleaned file
     raw_output = agent.run("""
         You are a data analysis agent. Follow these instructions EXACTLY:
         1. Load the data from the given `source_file` ONLY. DO NOT create your OWN DATA.
         3. Save all figures to `./figures` as PNG using matplotlib or seaborn.
         4. Use only authorized imports: `pandas`, `numpy`, `matplotlib.pyplot`, `seaborn`, `json`.
         5. DO NOT return any explanations, thoughts, or narration outside the final JSON block
+        6. Run agent efficiently and remove repetitive task and complete in less than 40 seconds.
         7. Output ONLY the following JSON code block format, exactly:
         {
             'observations': {
                 ...
             }
         }
+    """, additional_args={"additional_notes": additional_notes, "source_file": tmp_cleaned})
+    # Parse output
     parsed_result = extract_json_from_codeagent_output(raw_output) or {
         "error": "Failed to extract structured JSON"
     }
+    # Log execution stats
     execution_time = time.time() - start_time
     final_memory = process.memory_info().rss / 1024 ** 2
     memory_usage = final_memory - initial_memory
         "memory_usage_mb": round(memory_usage, 2)
     })
+    # Upload any figures
     visuals = [os.path.join('./figures', f) for f in os.listdir('./figures') if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
     for viz in visuals:
         wandb.log({os.path.basename(viz): wandb.Image(viz)})
     run.finish()
+    # HTML Summary
     summary_html = "<h3>📊 Data Analysis Summary</h3>"
     if "observations" in parsed_result:
         summary_html += "<h4>🔍 Observations</h4><ul>" + "".join(
     if "error" in parsed_result:
         summary_html += f"<p style='color:red'><b>Error:</b> {parsed_result['error']}</p>"
     return summary_html, visuals
 def format_analysis_report(raw_output, visuals):
     import json