pavanmutha commited on
Commit
cb1d344
·
verified ·
1 Parent(s): f2d52e9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +123 -203
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import os
2
  import re
3
  import gradio as gr
@@ -24,45 +25,22 @@ from sklearn.preprocessing import LabelEncoder
24
  from datetime import datetime
25
  from PIL import Image
26
 
 
27
  # Authenticate with Hugging Face
28
  hf_token = os.getenv("HF_TOKEN")
29
  login(token=hf_token)
30
 
 
31
  # SmolAgent initialization
32
  model = HfApiModel("mistralai/Mixtral-8x7B-Instruct-v0.1", token=hf_token)
33
 
 
34
  # Globals
35
  df_global = None
36
  target_column_global = None
37
- data_summary_global = None # ⬅️ Added for summarized data
38
 
39
- def clean_data(df):
40
- df = df.dropna(how='all', axis=1).dropna(how='all', axis=0)
41
- for col in df.select_dtypes(include='object').columns:
42
- df[col] = df[col].astype(str)
43
- df[col] = LabelEncoder().fit_transform(df[col])
44
- df = df.fillna(df.mean(numeric_only=True))
45
- return df
46
-
47
- def summarize_data(df: pd.DataFrame, max_cols: int = 10, max_rows: int = 5) -> str:
48
- summary = []
49
- summary.append(f"Dataset shape: {df.shape}")
50
- summary.append("\nColumn types:\n" + str(df.dtypes))
51
-
52
- num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
53
- cat_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
54
-
55
- summary.append("\nMissing values:\n" + str(df.isnull().sum()))
56
-
57
- if num_cols:
58
- summary.append("\nNumerical summary:\n" + str(df[num_cols].describe().T.head(max_rows)))
59
- if cat_cols:
60
- summary.append("\nCategorical value counts (top categories):")
61
- for col in cat_cols[:max_cols]:
62
- summary.append(f"\nColumn: {col}\n{df[col].value_counts().head(max_rows)}")
63
-
64
- return "\n".join(summary)
65
 
 
66
  def upload_file(file):
67
  global df_global, data_summary_global
68
  if file is None:
@@ -80,6 +58,124 @@ def set_target_column(col_name):
80
  target_column_global = col_name
81
  return f"✅ Target column set to: {col_name}"
82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
 
85
  def format_analysis_report(raw_output, visuals):
@@ -165,182 +261,6 @@ def format_insights(insights, visuals):
165
 
166
 
167
 
168
- ### ✅ 2. Add a pre-check fallback for non-compliant agent outputs
169
-
170
- def extract_json_from_codeagent_output(raw_output):
171
- import re, json, ast
172
-
173
- try:
174
- # Extract code blocks from ```python ... ```
175
- code_blocks = re.findall(r"```(?:py|python)?\n(.*?)```", raw_output, re.DOTALL)
176
- for block in code_blocks:
177
- # Try extracting from print(json.dumps({...}))
178
- json_match = re.search(
179
- r"print\(\s*json\.dumps\(\s*(\{[\s\S]*?\})\s*\)\s*\)",
180
- block,
181
- re.DOTALL
182
- ) or re.search(
183
- r"json\.dumps\(\s*(\{[\s\S]*?\})\s*\)",
184
- block,
185
- re.DOTALL
186
- )
187
- if json_match:
188
- return json.loads(json_match.group(1))
189
-
190
- # Try extracting from: result = {...}
191
- result_match = re.search(
192
- r"result\s*=\s*(\{[\s\S]*?\})",
193
- block,
194
- re.DOTALL
195
- )
196
- if result_match:
197
- raw_dict = result_match.group(1)
198
- try:
199
- return json.loads(raw_dict) # Try strict JSON
200
- except json.JSONDecodeError:
201
- return ast.literal_eval(raw_dict) # Try Python dict parsing
202
-
203
- # Final fallback: look for any dict-like thing in entire output
204
- fallback_match = re.search(r"\{[\s\S]+\}", raw_output)
205
- if fallback_match:
206
- raw_dict = fallback_match.group(0)
207
- try:
208
- return json.loads(raw_dict)
209
- except json.JSONDecodeError:
210
- return ast.literal_eval(raw_dict)
211
-
212
- except Exception as e:
213
- print(f"extract_json_from_codeagent_output() failed: {e}")
214
- return None
215
-
216
-
217
-
218
-
219
-
220
- def analyze_data(csv_file=None, additional_notes="", use_summary=True):
221
- try:
222
- start_time = time.time()
223
- process = psutil.Process(os.getpid())
224
- initial_memory = process.memory_info().rss / 1024 ** 2
225
-
226
- # Clean up and prepare directories
227
- if os.path.exists('./figures'):
228
- shutil.rmtree('./figures')
229
- os.makedirs('./figures', exist_ok=True)
230
-
231
- # Initialize WandB
232
- wandb.login(key=os.environ.get('WANDB_API_KEY'))
233
- run = wandb.init(project="huggingface-data-analysis", config={
234
- "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
235
- "additional_notes": additional_notes,
236
- "source_file": csv_file.name if csv_file else "summarized_input"
237
- })
238
-
239
- # Initialize Code Agent
240
- agent = CodeAgent(
241
- tools=[],
242
- model=model,
243
- additional_authorized_imports=["numpy", "pandas", "matplotlib.pyplot", "seaborn", "sklearn", "json"]
244
- )
245
-
246
- # Choose prompt content
247
- if use_summary and data_summary_global:
248
- input_data = data_summary_global
249
- data_instruction = """
250
- You are analyzing summarized dataset information from a CSV file. Your job is to:
251
-
252
- 1. Interpret the summary content as if it was produced from a real dataset.
253
- 2. Derive at least 5 high-level insights based on column types, distributions, missing values, etc.
254
- 3. Imagine or mock visualizations and describe what they would show. Use synthetic data simulation with numpy/pandas if needed.
255
- 4. Save plots to './figures/' using matplotlib or seaborn.
256
-
257
- Always respond in the structured dictionary format below.
258
- """
259
- else:
260
- # Fall back to full file input
261
- input_data = None # You load file within the agent
262
- data_instruction = """
263
- You are a helpful data analysis agent. Please follow these very strict instructions and formatting:
264
-
265
- 1. Load the data from the provided `source_file`.
266
- 2. FIRST analyze the data structure (column names and types)
267
- 3. THEN generate visualizations using EXISTING columns with least 5 visualizations and 5 insights.
268
- 4. Save all plots to `./figures/` as PNGs using matplotlib or seaborn.
269
- 5. DO NOT use open() or print() statements.
270
- 6. Use only authorized imports: `pandas`, `numpy`, `matplotlib.pyplot`, `seaborn`, `json`.
271
- 7. DO NOT return any explanations, thoughts, or narration outside the final output block.
272
- 8. DO NOT use `...` in any dictionary values, arrays, or code blocks.
273
- 9. Use empty lists like [] or strings like "N/A" instead.
274
- 10.Respond only with a JSON-serializable dictionary in Python syntax. Do not include any thoughts, comments, or explanation.
275
- 11. Any logging or warnings must be disabled or redirected; the only stdout must be the single print(json.dumps(result)) call.
276
- 12. FINALLY return ONLY this exact format:
277
-
278
- ```python
279
- import json
280
-
281
- result = {
282
- "observations": {
283
- "numeric_columns": [...],
284
- "categorical_columns": [...],
285
- "data_issues": "..."
286
- },
287
- "insights": [
288
- {"category": "Insight A", "insight": "Description of insight A"},
289
- {"category": "Insight B", "insight": "Description of insight B"}
290
- ]
291
- }
292
-
293
- print(json.dumps(result))
294
- ```<end_code>
295
- Be concise and avoid any narrative outside this final dictionary.
296
- Never use unauthorized imports (only pandas, numpy, matplotlib, seaborn are allowed)
297
- """
298
-
299
- # Run agent with either summarized content or CSV
300
- analysis_result = agent.run(data_instruction, additional_args={
301
- "additional_notes": additional_notes,
302
- "source_file": csv_file.name if csv_file and not use_summary else None,
303
- "data_summary": input_data if use_summary else None
304
- })
305
-
306
- # Performance metrics
307
- execution_time = time.time() - start_time
308
- final_memory = process.memory_info().rss / 1024 ** 2
309
- memory_usage = final_memory - initial_memory
310
-
311
- wandb.log({
312
- "execution_time_sec": execution_time,
313
- "memory_usage_mb": memory_usage
314
- })
315
-
316
- # Collect visualizations
317
- visuals = sorted([
318
- os.path.join('./figures', f) for f in os.listdir('./figures')
319
- if f.endswith(('.png', '.jpg', '.jpeg'))
320
- ])
321
-
322
- for viz in visuals:
323
- wandb.log({os.path.basename(viz): wandb.Image(viz)})
324
-
325
- run.finish()
326
-
327
- print("DEBUG - Raw agent output:", analysis_result[:500] + "...")
328
- with open("agent_output.txt", "w") as f:
329
- f.write(str(analysis_result))
330
-
331
- parsed_result = extract_json_from_codeagent_output(analysis_result)
332
- if parsed_result:
333
- return format_analysis_report(parsed_result, visuals)
334
- else:
335
- error_msg = f"Failed to parse agent output. Showing raw response:\n{str(analysis_result)[:2000]}"
336
- print(error_msg)
337
- return f"<pre>{error_msg}</pre>", visuals
338
-
339
- except Exception as e:
340
- error_msg = f"Analysis failed with error: {str(e)}"
341
- print(error_msg)
342
- return f"<pre>{error_msg}</pre>", []
343
-
344
 
345
  def compare_models():
346
  import seaborn as sns
 
1
+ # Initialization and Imports
2
  import os
3
  import re
4
  import gradio as gr
 
25
  from datetime import datetime
26
  from PIL import Image
27
 
28
+
29
  # Authenticate with Hugging Face
30
  hf_token = os.getenv("HF_TOKEN")
31
  login(token=hf_token)
32
 
33
+
34
  # SmolAgent initialization
35
  model = HfApiModel("mistralai/Mixtral-8x7B-Instruct-v0.1", token=hf_token)
36
 
37
+
38
  # Globals
39
  df_global = None
40
  target_column_global = None
 
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
+ # File upload and cleanup
44
  def upload_file(file):
45
  global df_global, data_summary_global
46
  if file is None:
 
58
  target_column_global = col_name
59
  return f"✅ Target column set to: {col_name}"
60
 
61
def clean_data(df):
    """Prepare a raw DataFrame for modeling.

    Drops rows/columns that are entirely empty, label-encodes every
    object-dtype column, and fills remaining numeric NaNs with the
    column mean. Returns the cleaned DataFrame.
    """
    df = df.dropna(how='all', axis=1).dropna(how='all', axis=0)
    # Encode each text column as integer labels (NaN becomes the 'nan' string
    # category, matching the astype(str) conversion).
    for column in df.select_dtypes(include='object').columns:
        as_text = df[column].astype(str)
        df[column] = LabelEncoder().fit_transform(as_text)
    return df.fillna(df.mean(numeric_only=True))
68
+
69
+
70
+ # Fallback: extract JSON when the CodeAgent output is not in the expected format
71
+
72
def extract_json_from_codeagent_output(raw_output):
    """Best-effort recovery of the result dict from a CodeAgent transcript.

    Tries, in order: a ``print(json.dumps({...}))`` (or bare ``json.dumps``)
    call inside a fenced code block, a ``result = {...}`` assignment inside a
    fenced block, and finally any dict-looking text anywhere in the output.
    Returns the parsed dict, or None when nothing parseable is found.
    """
    import re, json, ast

    def parse_dict(text):
        # Strict JSON first; fall back to Python-literal syntax
        # (single quotes, trailing commas the agent tends to emit).
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            return ast.literal_eval(text)

    try:
        fenced = re.findall(r"```(?:py|python)?\n(.*?)```", raw_output, re.DOTALL)
        for snippet in fenced:
            dumped = re.search(
                r"print\(\s*json\.dumps\(\s*(\{[\s\S]*?\})\s*\)\s*\)",
                snippet,
                re.DOTALL,
            ) or re.search(
                r"json\.dumps\(\s*(\{[\s\S]*?\})\s*\)",
                snippet,
                re.DOTALL,
            )
            if dumped:
                # json.dumps arguments are expected to already be valid JSON.
                return json.loads(dumped.group(1))

            assigned = re.search(r"result\s*=\s*(\{[\s\S]*?\})", snippet, re.DOTALL)
            if assigned:
                return parse_dict(assigned.group(1))

        # Last resort: grab the widest brace-delimited span in the raw text.
        anywhere = re.search(r"\{[\s\S]+\}", raw_output)
        if anywhere:
            return parse_dict(anywhere.group(0))
    except Exception as e:
        print(f"extract_json_from_codeagent_output() failed: {e}")
    return None
117
+
118
+
119
+
120
+
121
+ # Data Analysis Function with CodeAgent
122
def analyze_data(csv_file, additional_notes=""):
    """Run the CodeAgent over an uploaded CSV and build an analysis report.

    Args:
        csv_file: Uploaded file object (Gradio file) forwarded to the agent
            as ``source_file``; may be None.
        additional_notes: Free-text notes forwarded to the agent.

    Returns:
        The value of ``format_analysis_report(analysis_result, visuals)`` on
        success, or a ``("<pre>…</pre>", [])`` error pair on failure so the
        UI degrades gracefully instead of raising.
    """
    try:
        start_time = time.time()
        process = psutil.Process(os.getpid())
        initial_memory = process.memory_info().rss / 1024 ** 2

        # Start from a clean figures directory so stale plots are never reported.
        if os.path.exists('./figures'):
            shutil.rmtree('./figures')
        os.makedirs('./figures', exist_ok=True)

        wandb.login(key=os.environ.get('WANDB_API_KEY'))
        run = wandb.init(project="huggingface-data-analysis", config={
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "additional_notes": additional_notes,
            "source_file": csv_file.name if csv_file else None
        })

        try:
            agent = CodeAgent(tools=[], model=model, additional_authorized_imports=["numpy", "pandas", "matplotlib.pyplot", "seaborn", "sklearn", "json"])
            analysis_result = agent.run("""
            You are a helpful data analysis agent. Just return insight information and visualization.
            Load the data that is passed.do not create your own.
            Automatically detect numeric columns and names.
            2. 5 data visualizations
            3. at least 5 insights from data
            5. Generate publication-quality visualizations and save to './figures/'.
            Do not use 'open()' or write to files. Just return variables and plots.
            The dictionary should have the following structure:
            {
            'observations': {
            'observation_1_key': 'observation_1_value',
            'observation_2_key': 'observation_2_value',
            ...
            },
            'insights': {
            'insight_1_key': 'insight_1_value',
            'insight_2_key': 'insight_2_value',
            ...
            }
            }
            """, additional_args={"additional_notes": additional_notes, "source_file": csv_file})

            execution_time = time.time() - start_time
            final_memory = process.memory_info().rss / 1024 ** 2
            memory_usage = final_memory - initial_memory
            wandb.log({"execution_time_sec": execution_time, "memory_usage_mb": memory_usage})

            # Sorted for a deterministic gallery order across runs.
            visuals = sorted(
                os.path.join('./figures', f)
                for f in os.listdir('./figures')
                if f.endswith(('.png', '.jpg', '.jpeg'))
            )
            for viz in visuals:
                wandb.log({os.path.basename(viz): wandb.Image(viz)})
        finally:
            # Always close the W&B run, even when the agent errors out,
            # so runs are not left dangling in the dashboard.
            run.finish()

        return format_analysis_report(analysis_result, visuals)
    except Exception as e:
        error_msg = f"Analysis failed with error: {str(e)}"
        print(error_msg)
        return f"<pre>{error_msg}</pre>", []
174
+
175
+
176
+
177
+
178
+
179
 
180
 
181
  def format_analysis_report(raw_output, visuals):
 
261
 
262
 
263
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
 
265
  def compare_models():
266
  import seaborn as sns