Spaces:

Sid9797
/

self-correcting-data-validation-agent

Sleeping

App Files Files Community

Siddhesh Patil committed on Feb 14

Commit

b67668b

0 Parent(s):

Initial commit - Self-Correcting Data Validation Agent

Browse files

Files changed (32) hide show

.gitignore +6 -0
README.md +38 -0
app.py +236 -0
employee_dataset_50rows.csv +51 -0
requirements.txt +8 -0
src/agent/graph.py +225 -0
src/agent/offline_runner.py +27 -0
src/agent/schemas.py +45 -0
src/core/cleaning.py +244 -0
src/core/query.py +155 -0
src/core/security.py +19 -0
src/eval/benchmarks.json +105 -0
src/eval/run_agent_suite.py +119 -0
src/eval/run_eval.py +113 -0
src/tests/test_query.py +11 -0
src/tests_agent/test_offline_runner.py +30 -0
src/tests_agent/tests_schema.py +22 -0
test_inputs/case01_simple.txt +7 -0
test_inputs/case02_word_numbers.txt +7 -0
test_inputs/case03_missing_fields.txt +4 -0
test_inputs/case04_bad_performance.txt +7 -0
test_inputs/case05_weird_date.txt +7 -0
test_inputs/case06_random_noise.txt +9 -0
test_inputs/case07_duplicate_fields.txt +7 -0
test_inputs/case08_two_records.txt +15 -0
test_inputs/case09_no_ids.txt +5 -0
test_inputs/case10_conflicting_values.txt +7 -0
test_inputs/case11_unstructured_paragraph.txt +3 -0
test_inputs/case12_partial_sentences.txt +1 -0
test_inputs/case13_garbage_mixed.txt +11 -0
test_inputs/case14_large_block.txt +5 -0
test_inputs/case15_extreme_noise.txt +4 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,6 @@

+.venv/
+__pycache__/
+.env
+.env.*
+*.pyc
+.DS_Store

README.md ADDED Viewed

	@@ -0,0 +1,38 @@

+# AI Data Validation Agent (Industry-ready)
+A Streamlit app that:
+- Cleans & normalizes messy HR-like tabular data (emails, dates, ages, salaries, departments)
+- Produces an **audit report** of what was fixed
+- Answers natural-language questions **deterministically** using pandas (not "LLM guesses")
+- Uses an LLM only to convert the user's question into a small JSON query spec + to explain results
+## Run locally
+```bash
+python -m venv .venv && source .venv/bin/activate
+pip install -r requirements.txt
+streamlit run app.py
+```
+## Environment variables
+Set your OpenAI API key in the **Streamlit sidebar** or via an environment variable:
+```bash
+export OPENAI_API_KEY="..."
+```
+## Evaluation (accuracy benchmarks)
+Run deterministic benchmarks (no LLM required):
+```bash
+python -m venv .venv && source .venv/bin/activate
+pip install -r requirements.txt
+python -m src.eval.run_eval --csv your_data.csv
+```
+Optional end-to-end LLM evaluation:
+- Set `OPENAI_API_KEY`, and change a benchmark case `mode` to `"llm"` in `src/eval/benchmarks.json`.
+- Then re-run the command above.

app.py ADDED Viewed

	@@ -0,0 +1,236 @@

+import os
+import json
+import streamlit as st
+import pandas as pd
+from src.agent.graph import run_agent
+from src.core.cleaning import clean_dataframe
+from src.core.query import plan_query_with_llm, execute_query, summarize_results_with_llm
+from src.core.security import basic_injection_check
+# Page chrome: browser-tab title/icon, wide layout, heading and caption.
+st.set_page_config(page_title="AI Data Validation Agent (Industry-ready)", page_icon="🤖", layout="wide")
+st.title("🤖 AI Data Validation Agent (Industry-ready)")
+st.caption("Deterministic pandas answers + LLM planning/summarization (no LLM guessing).")
+# Sidebar inputs used by the tabs below: API key (pre-filled from OPENAI_API_KEY),
+# model name and the CSV upload widget.
+with st.sidebar:
+    st.header("Configuration")
+    api_key = st.text_input("OpenAI API Key", value=os.getenv("OPENAI_API_KEY",""), type="password")
+    model = st.selectbox("Model", ["gpt-4.1-mini","gpt-4.1","gpt-4o-mini","gpt-4o"], index=0)
+    st.divider()
+    uploaded = st.file_uploader("Upload CSV", type=["csv"])
+    st.divider()
+    st.markdown("**Tip:** Clean the data first, then ask questions.")
+# Create the session-state slots if they do not exist yet; the tabs read and write them.
+if "df_raw" not in st.session_state:
+    st.session_state.df_raw = None
+if "df_clean" not in st.session_state:
+    st.session_state.df_clean = None
+if "clean_report" not in st.session_state:
+    st.session_state.clean_report = None
+# One tab per workflow: CSV cleaning, question answering, and the text-extraction agent.
+tab1, tab2, tab3 = st.tabs(["📄 Data cleaning", "💬 Ask questions", "🧠 Self-correcting agent"])
+# Tab 1: preview the uploaded CSV, clean it on demand, and offer the cleaned file for download.
+with tab1:
+    st.subheader("1) Upload & clean")
+    if uploaded is None:
+        st.info("Upload a CSV from the sidebar.")
+    else:
+        # The upload is re-read into a DataFrame every time this branch runs.
+        df = pd.read_csv(uploaded)
+        st.session_state.df_raw = df
+        st.write("Raw preview")
+        st.dataframe(df.head(20), use_container_width=True)
+        # Cleaning only runs when the button is pressed; the result is kept in session state.
+        if st.button("Clean & normalize", type="primary"):
+            dfc, report = clean_dataframe(df)
+            st.session_state.df_clean = dfc
+            st.session_state.clean_report = report
+        # NOTE(review): df_clean is only replaced by the button above, so after uploading a
+        # different file the previously cleaned data is still shown until it is cleaned again.
+        if st.session_state.df_clean is not None:
+            st.success("Cleaned dataset ready ✅")
+            report = st.session_state.clean_report
+            st.write("Cleaning report")
+            st.json({"rows": report.rows, "fixes": report.fixes, "warnings": report.warnings})
+            st.write("Clean preview")
+            st.dataframe(st.session_state.df_clean, use_container_width=True)
+            # Serialise without the index column and expose the bytes as a download.
+            csv_bytes = st.session_state.df_clean.to_csv(index=False).encode("utf-8")
+            st.download_button("Download cleaned CSV", data=csv_bytes, file_name="cleaned.csv", mime="text/csv")
+# Tab 2: plan a query with the LLM, execute it against the cleaned frame, then summarise it.
+with tab2:
+    st.subheader("2) Ask deterministic questions")
+    # Questions are only answered against the cleaned frame stored in session state.
+    if st.session_state.df_clean is None:
+        st.warning("Clean your dataset first (Data cleaning tab).")
+    else:
+        question = st.text_input("Ask a question about the dataset", placeholder='e.g., "Names of users in Artificial Intelligence department"')
+        colA, colB = st.columns([1,1])
+        with colA:
+            run = st.button("Run query", type="primary")
+        with colB:
+            show_plan = st.checkbox("Show query plan (JSON)", value=False)
+        if run:
+            # Input checks in order: API key present, non-blank question, injection check.
+            if not api_key:
+                st.error("Please add your OpenAI API key in the sidebar.")
+            elif not question.strip():
+                st.error("Please type a question.")
+            else:
+                blocked, msg = basic_injection_check(question)
+                if blocked:
+                    st.error(msg)
+                else:
+                    # Any failure in planning, execution or summarisation is shown as an error message.
+                    try:
+                        spec = plan_query_with_llm(question, st.session_state.df_clean, api_key=api_key, model=model)
+                        if show_plan:
+                            st.code(spec.model_dump_json(indent=2), language="json")
+                        result = execute_query(spec, st.session_state.df_clean)
+                        st.write("Result table")
+                        st.dataframe(result, use_container_width=True)
+                        answer = summarize_results_with_llm(question, result, api_key=api_key, model=model)
+                        st.markdown("### Answer")
+                        st.write(answer)
+                    except Exception as e:
+                        st.error(str(e))
+        st.divider()
+        st.markdown("### Why this is accurate")
+        st.markdown("- LLM only creates a small JSON query plan.\n- Pandas executes it deterministically.\n- LLM only summarizes already computed results.")
+from src.agent.graph import run_agent
+import pandas as pd
+with tab3:
+    st.subheader("Self-Correcting Data Validation Agent")
+    st.caption("Paste messy data → Extract JSON → Validate → Auto-correct retries → Final schema-perfect output (NO hallucination)")
+    raw = st.text_area("Paste messy employee data (any format)", height=220)
+    max_attempts = st.slider("Max retries", 1, 6, 3)
+    # --- 1) VISUALIZE STATE MACHINE (diagram) ---
+    st.markdown("### 🧭 State Machine (Extract → Validate → Correct → Finalize)")
+    dot = """
+    digraph G {
+        rankdir=LR;
+        node [shape=box, style="rounded,filled", color="#444444", fillcolor="#F4F6F8"];
+        Extract [label="extract"];
+        Validate [label="validate"];
+        Correct [label="correct"];
+        Finalize [label="finalize"];
+        Extract -> Validate;
+        Validate -> Finalize [label="pass OR max_retries"];
+        Validate -> Correct [label="fail AND retries_left"];
+        Correct -> Validate;
+    }
+    """
+    try:
+        st.graphviz_chart(dot, use_container_width=True)
+    except Exception:
+        st.code(
+            "extract → validate → (pass) finalize\n"
+            "            ↘ (fail) correct → validate (loop)\n",
+            language="text"
+        )
+    if st.button("Run Agent", type="primary"):
+        if not api_key:
+            st.error("Please add your OpenAI API key in the sidebar.")
+            st.stop()
+        if not raw.strip():
+            st.error("Paste some messy data first.")
+            st.stop()
+        with st.spinner("Running extract → validate → correct loop..."):
+            final_state = run_agent(raw, api_key=api_key, model=model, max_attempts=max_attempts)
+        log = final_state.get("log", [])
+        result = final_state.get("result")
+        # --- 3) CORRECTION COUNT SUMMARY (metrics) ---
+        # attempts used
+        attempts_used = max((x.get("attempt", 0) for x in log), default=0)
+        # counts
+        employees_n = len(result.get("employees", [])) if result else 0
+        rejected_n = len(result.get("rejected", [])) if result else 0
+        # how many correct steps happened
+        correct_steps = sum(1 for x in log if x.get("step") == "correct")
+        validate_fails = sum(1 for x in log if x.get("step") == "validate" and x.get("status") == "fail")
+        st.markdown("### 📊 Run Summary")
+        c1, c2, c3, c4 = st.columns(4)
+        c1.metric("Attempts used", attempts_used if attempts_used else 1)
+        c2.metric("Corrections", correct_steps)
+        c3.metric("Valid employees", employees_n)
+        c4.metric("Rejected records", rejected_n)
+        # Optional: show pass/fail clearly
+        if result is None:
+            st.error("Could not produce schema-valid JSON within retry limit.")
+        else:
+            st.success("Schema-valid output ✅")
+        # --- 2) BEFORE / AFTER COMPARISON ---
+        st.markdown("### 🔁 Before vs After")
+        left, right = st.columns(2)
+        with left:
+            st.markdown("#### Before (Raw Input)")
+            st.code(raw.strip(), language="text")
+        with right:
+            st.markdown("#### After (Schema Output)")
+            if result is None:
+                st.code(final_state.get("last_json_text", ""), language="json")
+            else:
+                st.code(json.dumps(result, indent=2, default=str), language="json")
+        # --- Correction Log (keep your existing) ---
+        st.markdown("### 🧾 Correction Log")
+        st.json(log)
+        # If failed, stop here
+        if result is None:
+            st.markdown("### Last JSON Attempt (debug)")
+            st.code(final_state.get("last_json_text", ""), language="json")
+            st.stop()
+        # -------- Valid employees table --------
+        st.markdown("### ✅ Valid Employees")
+        employees = result.get("employees", [])
+        if employees:
+            df_emp = pd.DataFrame(employees)
+            st.dataframe(df_emp, use_container_width=True)
+        else:
+            st.info("No valid employees extracted (all records were rejected).")
+        # -------- Rejected records table --------
+        st.markdown("### 🚫 Rejected Records (No hallucination)")
+        rejected = result.get("rejected", [])
+        if rejected:
+            rej_rows = []
+            for r in rejected:
+                rej_rows.append(
+                    {
+                        "raw_record": r.get("raw_record", ""),
+                        "reasons": "; ".join(r.get("reasons", [])),
+                    }
+                )
+            df_rej = pd.DataFrame(rej_rows)
+            st.dataframe(df_rej, use_container_width=True)
+        else:
+            st.info("No rejected records. Everything was schema-valid.")
+        # -------- Download --------
+        st.download_button(
+            "Download JSON",
+            data=json.dumps(result, indent=2, default=str),
+            file_name="validated_output.json",
+            mime="application/json",
+        )

employee_dataset_50rows.csv ADDED Viewed

	@@ -0,0 +1,51 @@

+user_id,Name,Age,Email,Salary,Join_Date,Department,Performance_Score,Location,Job_Title
+1, Sarah Johnson ,28,sarah.j@techcorp.com,95000,2023-01-15,AI,8.5,New York,Senior Engineer
+2,michael chen,  thirty two ,michael at gmail.com,110000 ,15/02/2023,Data Science,9.2,San Francisco,Lead Analyst
+3,EMILY DAVIS,,emily@techcorp,87000,2023/03/10,ai,7.8,NYC,Engineer
+4,James Wilson,twenty nine,james.wilson@company.com,92000,01-04-2023,Machine Learning,8.7,San Francisco,ML Engineer
+5, Robert  Brown ,35,robert@techcorp.com,$125000,2023-05-20,AI , 9.5,New York,Principal Engineer
+6,Jennifer Lee,27,jennifer@@techcorp.com,NaN,2023-06-01,Data science,8.3,Chicago,Data Scientist
+7,Sarah Johnson,28,sarah.j@techcorp.com,95000,2023-01-15,AI,8.5,New York,Senior Engineer
+8,David Kim, 31, ,103000,2023-07-15,ML,nine,Seattle,Senior ML Engineer
+9,Lisa Anderson,26,lisa.anderson@tech.com,88000,2023-08-01,DataScience,8.9,Boston,Data Engineer
+10,,thirty,maria@techcorp.com,94000,2023-09-10,AI,8.6,New York,Engineer
+11,JOHN MARTINEZ, ,john.m@company.com,98000,12th Oct 2023,Artificial Intelligence,8.2,Chicago,Senior Engineer
+12,Amanda White,29,amanda@techcorp.com,  102000  ,2023-11-01,AI/ML,9.1,San Francisco,Tech Lead
+13,Chris Taylor,33,chris@@company.com,115000,2023-12-01,data science,8.8,Seattle,Principal Analyst
+14,Jessica Moore,twenty five,jessica@techcorp.com,85000USD,2024-01-05,AI,7.9,Boston,Junior Engineer
+15,daniel garcia,30,daniel at company.com,99000,2024-02-01,MachineLearning ,8.4,NYC,ML Engineer
+16,PATRICIA THOMAS,34,patricia@tech,120000,2024-02-15,AI,9.3,San Francisco,Staff Engineer
+17,Ryan Harris,28,ryan.h@techcorp.com,96000,10/03/2024,Data Science,8.5,Chicago,Senior Analyst
+18,Michelle Clark,27,michelle@company.com,$91000,2024-03-20,ai,8.1,New York,Engineer
+19,Kevin Rodriguez,  29  ,kevin@@techcorp.com,105000,01-04-2024,ML,9.0,Seattle,Senior Engineer
+20,Laura Lewis,twenty six,laura.lewis@tech.com,87000,2024-04-15,DataScience,7.7,Boston,Data Analyst
+21, Mark  Walker ,32,mark@techcorp.com,NaN,2024-05-01,AI,8.7,San Francisco,Senior Engineer
+22,Angela Hall,28,angela at gmail.com,93000 ,15/05/2024,Data science,eight,NYC,Data Scientist
+23,Thomas Allen,35,thomas@company,112000,2024/06/01,Artificial Intelligence,9.4,Chicago,Principal Engineer
+24,Karen Young,27,karen.y@techcorp.com,89000,05-06-2024,ai,8.3,Seattle,Engineer
+25,Steven King,thirty one,steven@@tech.com,107000,2024-07-10,Machine Learning,8.9,Boston,ML Engineer
+26,DONNA WRIGHT,29,donna@techcorp.com,$98000,2024-07-20,AI/ML,8.6,San Francisco,Tech Lead
+27,Jason Lopez,26,jason.lopez@company.com,  85000  ,2024-08-01,Data Science,7.8,New York,Junior Analyst
+28,Nancy Hill,thirty,nancy at techcorp.com,101000,15th Aug 2024,ai,9.1,Chicago,Senior Engineer
+29,paul scott,28,paul@tech,95000,2024/09/01,DataScience,8.4,Seattle,Data Engineer
+30,,27,betty@techcorp.com,88000,2024-09-15,ML,8.2,Boston,Engineer
+31,Mark Green,33,mark.g@@company.com,118000USD,01-10-2024,Artificial Intelligence,9.2,San Francisco,Staff Engineer
+32,Sandra Adams,twenty eight,sandra@techcorp.com,92000,2024-10-10,AI,8.0,NYC,Engineer
+33,GEORGE BAKER,29,george at company.com,  106000,2024/10/20,Data science,8.8,Chicago,Senior Analyst
+34,Carol Gonzalez,31,carol@tech.com,$99000,2024-11-01,Machine Learning,8.5,Seattle,ML Engineer
+35,Brian Nelson,26,brian@@techcorp.com,NaN,2024-11-15,ai,7.9,Boston,Junior Engineer
+36, Susan  Carter ,thirty two,susan@company.com,114000,01-12-2024,AI/ML,9.3,San Francisco,Principal Engineer
+37,edward mitchell,27,edward.m@techcorp.com,90000 ,15/12/2024,DataScience,8.3,New York,Data Scientist
+38,Margaret Perez,28,margaret at tech.com,97000,2024/12/20,AI,8.7,Chicago,Senior Engineer
+39,JOSEPH ROBERTS,thirty,joseph@@company.com,108000,2025-01-05,Data Science,nine,Seattle,Tech Lead
+40,Lisa Turner,29,lisa.t@techcorp,103000,2025-01-15,Machine Learning,8.6,Boston,Senior ML Engineer
+41,Frank Phillips,27,frank@techcorp.com,$94000,20/01/2025,ai,8.1,San Francisco,Engineer
+42,Helen Campbell,  26  ,helen at company.com,86000,2025/02/01,DataScience,7.8,NYC,Data Analyst
+43,matthew parker,31,matthew@@tech.com,  111000  ,05-02-2025,Artificial Intelligence,9.0,Chicago,Principal Analyst
+44,,twenty nine,elizabeth@techcorp.com,100000USD,2025-02-15,AI/ML,8.8,Seattle,Tech Lead
+45,ANTHONY EVANS,28,anthony@company,96000,2025/03/01,Data science,8.4,Boston,Senior Analyst
+46, Barbara  Edwards ,30,barbara.e@techcorp.com,NaN,10th Mar 2025,Machine Learning,8.9,San Francisco,ML Engineer
+47,kenneth collins,27,kenneth at tech.com,91000 ,2025-03-20,ai,8.2,New York,Engineer
+48,Dorothy Stewart,thirty three,dorothy@@company.com,$104000,01-04-2025,DataScience,9.1,Chicago,Senior Data Scientist
+49,ANDREW SANCHEZ,26,andrew@techcorp.com,87000,2025/04/10,AI,7.7,Seattle,Junior Engineer
+50,donna morris,29,donna.m at company.com,  99000  ,2025-04-20,Data Science,8.5,Boston,Data Analyst

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+streamlit>=1.31
+pandas>=2.0
+python-dateutil>=2.8
+openai>=1.0.0
+pydantic>=2.0
+rapidfuzz>=3.0
+langgraph>=0.2.0
+pytest>=8.0

src/agent/graph.py ADDED Viewed

	@@ -0,0 +1,225 @@

+from __future__ import annotations
+import json
+import math
+from typing import Any, Dict, List, Optional, TypedDict
+from langgraph.graph import StateGraph, END
+from pydantic import ValidationError
+from src.agent.schemas import ExtractedData
+# ---------- JSON safety ----------
+def _json_safe(obj):
+    """Recursively convert NaN/inf to None so payload becomes valid JSON."""
+    if obj is None:
+        return None
+    if isinstance(obj, float):
+        if math.isnan(obj) or math.isinf(obj):
+            return None
+        return obj
+    if isinstance(obj, dict):
+        return {k: _json_safe(v) for k, v in obj.items()}
+    if isinstance(obj, list):
+        return [_json_safe(v) for v in obj]
+    return obj
+# ---------- OpenAI helper ----------
+def _openai_client(api_key: str):
+    from openai import OpenAI
+    # robust for flaky networks
+    return OpenAI(api_key=api_key, timeout=60, max_retries=5)
+# System prompt for the extraction call (_llm_extract): describes the target JSON layout,
+# the rules against inventing values, and the normalisation rules.
+EXTRACT_SYSTEM = """You are a data extraction + validation agent.
+Your job: convert messy text into STRICT JSON that matches this schema:
+{
+  "employees": [
+    {
+      "user_id": int,
+      "name": string,
+      "age": int|null,
+      "email": string|null,
+      "salary": number|null,
+      "join_date": "YYYY-MM-DD"|null,
+      "department": one of ["Artificial Intelligence","AI/ML","Machine Learning","Data Science"],
+      "performance_score": number|null (0..10),
+      "location": string|null,
+      "job_title": string|null
+    }
+  ],
+  "rejected": [
+    { "raw_record": string, "reasons": [string, ...] }
+  ]
+}
+CRITICAL RULES (NO HALLUCINATION):
+- NEVER invent user_id. If user_id is missing/uncertain, DO NOT guess.
+  Put that record into "rejected" with reason "missing user_id".
+- NEVER guess values from vague text like "maybe", "around", "probably", "approx".
+  Use null for uncertain optional fields.
+- If a record cannot be made schema-valid WITHOUT guessing required fields, reject it.
+- Do not fabricate emails or domains. If email is invalid -> null (or reject only if required, but email is optional here).
+Normalization rules:
+- Output JSON ONLY, no markdown.
+- If a field is missing, set it to null (not empty string).
+- Normalize department values:
+  AI/ai/Artificial Intelligence -> "Artificial Intelligence"
+  AI/ML -> "AI/ML"
+  ML/Machine Learning -> "Machine Learning"
+  DataScience/Data science -> "Data Science"
+- Convert word numbers (e.g., "twenty nine") to integers when clear.
+- Convert dates to ISO YYYY-MM-DD if possible, else null.
+- Salary: remove $ and commas; if missing, null.
+- performance_score must be 0..10; if value is out of range or unclear -> null.
+"""
+# System prompt for the correction call (_llm_correct): the model receives its previous JSON
+# plus the validation error and must return repaired JSON.
+CORRECT_SYSTEM = """You are a self-correcting data validation agent.
+You will be given:
+- the previous JSON you produced
+- a validation error message describing why it failed
+Fix the JSON to satisfy the schema.
+CRITICAL RULES (NO HALLUCINATION):
+- NEVER invent user_id. If user_id is missing/uncertain, reject the record instead of guessing.
+- NEVER guess uncertain values (maybe/around/probably). Use null for optional fields.
+- Prefer moving problematic records to "rejected" with clear reasons rather than fabricating data.
+Rules:
+- Output JSON ONLY.
+- Keep valid records in "employees".
+- Put non-fixable records in "rejected" with reasons.
+- Use null for missing fields (not empty strings).
+"""
+# ---------- LangGraph State ----------
+class AgentState(TypedDict):
+    """State dictionary passed between the graph nodes of the agent."""
+    raw_text: str  # the messy input text given to run_agent
+    attempt: int  # starts at 1; incremented after each correction step
+    max_attempts: int  # once attempt reaches this value, a failed validation ends the run
+    last_json_text: str  # most recent model output (from extract or correct)
+    validation_error: str  # error text from the last failed validation, "" after a pass
+    result: Optional[Dict[str, Any]]  # validated data as a dict, or None while invalid
+    log: List[Dict[str, Any]]  # one entry per extract/validate/correct step
+def _llm_extract(state: AgentState, api_key: str, model: str) -> AgentState:
+    client = _openai_client(api_key)
+    payload = {"raw_text": state["raw_text"]}
+    resp = client.responses.create(
+        model=model,
+        input=[
+            {"role": "system", "content": EXTRACT_SYSTEM},
+            {"role": "user", "content": json.dumps(payload)},
+        ],
+        temperature=0,
+        max_output_tokens=1400,
+    )
+    out = (resp.output_text or "").strip()
+    state["last_json_text"] = out
+    state["log"].append({"step": "extract", "attempt": state["attempt"], "output": out[:2000]})
+    return state
+def _validate(state: AgentState) -> AgentState:
+    try:
+        data = ExtractedData.model_validate_json(state["last_json_text"])
+        state["result"] = _json_safe(data.model_dump())
+        state["validation_error"] = ""
+        state["log"].append({"step": "validate", "attempt": state["attempt"], "status": "pass"})
+    except ValidationError as e:
+        state["result"] = None
+        state["validation_error"] = str(e)
+        state["log"].append(
+            {
+                "step": "validate",
+                "attempt": state["attempt"],
+                "status": "fail",
+                "error": state["validation_error"][:2000],
+            }
+        )
+    return state
+def _llm_correct(state: AgentState, api_key: str, model: str) -> AgentState:
+    client = _openai_client(api_key)
+    payload = {
+        "previous_json": state["last_json_text"],
+        "validation_error": state["validation_error"],
+    }
+    resp = client.responses.create(
+        model=model,
+        input=[
+            {"role": "system", "content": CORRECT_SYSTEM},
+            {"role": "user", "content": json.dumps(payload)},
+        ],
+        temperature=0,
+        max_output_tokens=1400,
+    )
+    out = (resp.output_text or "").strip()
+    state["last_json_text"] = out
+    state["log"].append({"step": "correct", "attempt": state["attempt"], "output": out[:2000]})
+    return state
+def _should_retry(state: AgentState) -> str:
+    if state["result"] is not None:
+        return "finalize"
+    if state["attempt"] >= state["max_attempts"]:
+        return "finalize"
+    return "retry"
+def build_graph(api_key: str, model: str):
+    g = StateGraph(AgentState)
+    g.add_node("extract", lambda s: _llm_extract(s, api_key, model))
+    g.add_node("validate", _validate)
+    g.add_node("correct", lambda s: _llm_correct(s, api_key, model))
+    g.set_entry_point("extract")
+    g.add_edge("extract", "validate")
+    g.add_conditional_edges(
+        "validate",
+        _should_retry,
+        {"retry": "correct", "finalize": END},
+    )
+    # after correcting, increment attempt then validate again
+    def inc_attempt(state: AgentState) -> AgentState:
+        state["attempt"] += 1
+        return state
+    g.add_node("inc_attempt", inc_attempt)
+    g.add_edge("correct", "inc_attempt")
+    g.add_edge("inc_attempt", "validate")
+    return g.compile()
+def run_agent(raw_text: str, api_key: str, model: str = "gpt-4.1-mini", max_attempts: int = 3):
+    graph = build_graph(api_key, model)
+    init: AgentState = {
+        "raw_text": raw_text,
+        "attempt": 1,
+        "max_attempts": max_attempts,
+        "last_json_text": "",
+        "validation_error": "",
+        "result": None,
+        "log": [],
+    }
+    final_state = graph.invoke(init)
+    return final_state

src/agent/offline_runner.py ADDED Viewed

	@@ -0,0 +1,27 @@

+from __future__ import annotations
+from typing import List, Dict, Any, Optional
+from pydantic import ValidationError
+from src.agent.schemas import ExtractedData
+def run_offline_agent(raw_text: str, candidate_json_outputs: List[str], max_attempts: int = 3):
+    """
+    Offline testing: instead of calling OpenAI, we feed the agent JSON outputs
+    and see if validation passes, and how many attempts it takes.
+    """
+    log = []
+    last = ""
+    attempt = 1
+    for out in candidate_json_outputs[:max_attempts]:
+        last = out
+        log.append({"step": "extract", "attempt": attempt, "output": out})
+        try:
+            data = ExtractedData.model_validate_json(out)
+            log.append({"step": "validate", "attempt": attempt, "status": "pass"})
+            return {"result": data.model_dump(), "log": log, "last_json_text": last}
+        except ValidationError as e:
+            log.append({"step": "validate", "attempt": attempt, "status": "fail", "error": str(e)})
+            attempt += 1
+    return {"result": None, "log": log, "last_json_text": last}

src/agent/schemas.py ADDED Viewed

	@@ -0,0 +1,45 @@

+from __future__ import annotations
+from datetime import date
+from typing import List, Optional, Literal
+from pydantic import BaseModel, Field, field_validator
+# The only department names accepted on an Employee; any other string fails validation.
+Department = Literal[
+    "Artificial Intelligence",
+    "AI/ML",
+    "Machine Learning",
+    "Data Science",
+]
+class Employee(BaseModel):
+    user_id: int
+    name: str = Field(min_length=1)
+    age: Optional[int] = Field(default=None, ge=16, le=80)
+    email: Optional[str] = None
+    salary: Optional[float] = Field(default=None, ge=0)
+    join_date: Optional[date] = None
+    department: Department
+    performance_score: Optional[float] = Field(default=None, ge=0, le=10)
+    location: Optional[str] = None
+    job_title: Optional[str] = None
+    @field_validator("name")
+    @classmethod
+    def normalize_name(cls, v: str) -> str:
+        v = " ".join(str(v).strip().split())
+        return v.title()
+# A record kept out of "employees", with the original text and the reasons for rejection.
+class RejectedRecord(BaseModel):
+    raw_record: str = Field(min_length=1)  # must be a non-empty string
+    reasons: List[str] = Field(min_length=1)  # at least one reason is required
+# Top-level payload validated by the agent; both lists default to empty.
+class ExtractedData(BaseModel):
+    # Valid, schema-compliant records only
+    employees: List[Employee] = Field(default_factory=list)
+    # Records that cannot be made valid WITHOUT guessing required fields
+    rejected: List[RejectedRecord] = Field(default_factory=list)

src/core/cleaning.py ADDED Viewed

	@@ -0,0 +1,244 @@

+from __future__ import annotations
+import re
+from dataclasses import dataclass
+from typing import Dict, List, Tuple, Optional
+import pandas as pd
+from dateutil import parser as dtparser
+from rapidfuzz import process, fuzz
+WORD_NUMS = {
+    "zero": 0, "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
+    "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10,
+    "eleven": 11, "twelve": 12, "thirteen": 13, "fourteen": 14,
+    "fifteen": 15, "sixteen": 16, "seventeen": 17, "eighteen": 18, "nineteen": 19,
+    "twenty": 20, "thirty": 30, "forty": 40, "fifty": 50, "sixty": 60
+}
+# Lower-cased department spellings mapped to their canonical names.
+DEPT_CANON = {
+    "ai": "Artificial Intelligence",
+    "artificial intelligence": "Artificial Intelligence",
+    "ai/ml": "AI/ML",
+    "ml": "Machine Learning",
+    "machine learning": "Machine Learning",
+    "data science": "Data Science",
+    "datascience": "Data Science",
+}
+# Lower-cased location spellings mapped to their canonical names.
+LOCATION_CANON = {
+    "nyc": "New York",
+    "new york": "New York",
+    "san francisco": "San Francisco",
+    "chicago": "Chicago",
+    "seattle": "Seattle",
+    "boston": "Boston",
+}
+# Whole-string pattern: local part, a single "@", a domain, then a dot and a TLD of 2+ letters.
+EMAIL_RE = re.compile(r"^[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}$")
+# Audit summary paired with the cleaned DataFrame in clean_dataframe's return annotation.
+@dataclass
+class CleaningReport:
+    rows: int  # presumably the number of rows processed; verify against clean_dataframe
+    fixes: Dict[str, int]  # fix name (e.g. "name_normalized") -> number of values changed
+    warnings: List[str]  # human-readable messages about values that still look wrong
+def _clean_name(s: str) -> Optional[str]:
+    if s is None or pd.isna(s):
+        return None
+    s = str(s).strip()
+    s = re.sub(r"\s+", " ", s)
+    s = " ".join([w.capitalize() for w in s.split()])
+    return s if s else None
+def _parse_word_number(s: str) -> Optional[int]:
+    if s is None:
+        return None
+    t = str(s).strip().lower()
+    if t == "":
+        return None
+    if re.fullmatch(r"\d+", t):
+        return int(t)
+    parts = re.split(r"[\s\-]+", t)
+    total = 0
+    matched = False
+    for p in parts:
+        if p in WORD_NUMS:
+            total += WORD_NUMS[p]
+            matched = True
+        else:
+            return None
+    return total if matched else None
+def _clean_email(s: str) -> Tuple[str, bool]:
+    if s is None:
+        return "unknown@unknown.com", True
+    t = str(s).strip().lower()
+    if t == "":
+        return "unknown@unknown.com", True
+    t = t.replace(" at ", "@").replace(" dot ", ".")
+    t = t.replace("..", ".")
+    t = re.sub(r"@{2,}", "@", t)
+    if EMAIL_RE.match(t):
+        return t, (t != str(s).strip().lower())
+    if "@" in t and "." not in t.split("@", 1)[1]:
+        return t, False
+    return t, False
+def _clean_salary(s: str) -> Tuple[Optional[float], bool]:
+    if s is None:
+        return None, False
+    t = str(s).strip()
+    if t == "" or t.lower() == "nan":
+        return None, False
+    t2 = re.sub(r"[,$]", "", t)
+    t2 = re.sub(r"usd", "", t2, flags=re.I).strip()
+    try:
+        return float(t2), (t2 != t)
+    except ValueError:
+        return None, False
+def _clean_date(s: str) -> Tuple[Optional[str], bool]:
+    if s is None:
+        return None, False
+    t = str(s).strip()
+    if t == "" or t.lower() == "nan":
+        return None, False
+    try:
+        dt = dtparser.parse(t, dayfirst=False, fuzzy=True)
+        iso = dt.date().isoformat()
+        return iso, (iso != t)
+    except Exception:
+        return None, False
+def _canon_from_map(value: str, mapping: Dict[str, str], threshold: int = 90) -> Tuple[str, bool]:
+    raw = (value or "").strip()
+    if raw == "":
+        return raw, False
+    key = raw.lower()
+    if key in mapping:
+        canon = mapping[key]
+        return canon, canon != raw
+    match = process.extractOne(key, mapping.keys(), scorer=fuzz.ratio)
+    if match and match[1] >= threshold:
+        canon = mapping[match[0]]
+        return canon, canon != raw
+    return raw, False
+def clean_dataframe(df: pd.DataFrame) -> Tuple[pd.DataFrame, CleaningReport]:
+    out = df.copy()
+    fixes: Dict[str, int] = {}
+    warnings: List[str] = []
+    out.columns = [c.strip() for c in out.columns]
+    col_map = {c.lower(): c for c in out.columns}
+    name_col = col_map.get("name")
+    age_col = col_map.get("age")
+    email_col = col_map.get("email")
+    salary_col = col_map.get("salary")
+    join_col = col_map.get("join_date") or col_map.get("join date")
+    dept_col = col_map.get("department")
+    perf_col = col_map.get("performance_score") or col_map.get("performance score")
+    loc_col = col_map.get("location")
+    if name_col:
+        before = out[name_col].astype(str).tolist()
+        out[name_col] = out[name_col].apply(_clean_name)
+        fixes["name_normalized"] = sum(b != a for b, a in zip(before, out[name_col].tolist()))
+    if age_col:
+        def clean_age(x):
+            if x is None:
+                return None
+            t = str(x).strip().lower()
+            if t == "":
+                return None
+            n = _parse_word_number(t)
+            if n is not None:
+                return n
+            try:
+                return int(float(t))
+            except Exception:
+                return None
+        before = out[age_col].tolist()
+        out[age_col] = out[age_col].apply(clean_age)
+        fixes["age_parsed"] = sum(b != a for b, a in zip(before, out[age_col].tolist()))
+    if email_col:
+        before = out[email_col].tolist()
+        new_vals, changed = [], 0
+        for v in before:
+            cleaned, did = _clean_email(v)
+            if did:
+                changed += 1
+            new_vals.append(cleaned)
+        out[email_col] = new_vals
+        fixes["email_cleaned"] = changed
+        invalid = [e for e in out[email_col].tolist() if not EMAIL_RE.match(e)]
+        if invalid:
+            warnings.append(f"{len(invalid)} email(s) still look invalid (e.g., '{invalid[0]}').")
+    if salary_col:
+        before = out[salary_col].tolist()
+        vals, changed = [], 0
+        for v in before:
+            s2, did = _clean_salary(v)
+            if did:
+                changed += 1
+            vals.append(s2)
+        out[salary_col] = vals
+        fixes["salary_cleaned"] = changed
+    if join_col:
+        before = out[join_col].tolist()
+        vals, changed = [], 0
+        for v in before:
+            d2, did = _clean_date(v)
+            if did:
+                changed += 1
+            vals.append(d2)
+        out[join_col] = vals
+        fixes["join_date_normalized"] = changed
+    if dept_col:
+        before = out[dept_col].astype(str).tolist()
+        new, changed = [], 0
+        for v in before:
+            canon, did = _canon_from_map(v, DEPT_CANON, threshold=90)
+            if did:
+                changed += 1
+            new.append(canon)
+        out[dept_col] = new
+        fixes["department_standardized"] = changed
+    if loc_col:
+        before = out[loc_col].astype(str).tolist()
+        new, changed = [], 0
+        for v in before:
+            canon, did = _canon_from_map(v, LOCATION_CANON, threshold=88)
+            if did:
+                changed += 1
+            new.append(canon)
+        out[loc_col] = new
+        fixes["location_standardized"] = changed
+    if perf_col:
+        before = out[perf_col].tolist()
+        def clean_perf(x):
+            if x is None:
+                return None
+            t = str(x).strip().lower()
+            if t == "" or t == "nan":
+                return None
+            n = _parse_word_number(t)
+            if n is not None:
+                return float(n)
+            try:
+                return float(t)
+            except Exception:
+                return None
+        out[perf_col] = out[perf_col].apply(clean_perf)
+        fixes["performance_parsed"] = sum(b != a for b, a in zip(before, out[perf_col].tolist()))
+    return out, CleaningReport(rows=len(out), fixes=fixes, warnings=warnings)

src/core/query.py ADDED Viewed

	@@ -0,0 +1,155 @@

+from __future__ import annotations
+from typing import Any, List
+import pandas as pd
+from pydantic import BaseModel, Field, ValidationError
+import math
+# One filter condition of a QuerySpec; execute_query applies it to a single column.
+class FilterSpec(BaseModel):
+    # Name of the column to filter on.
+    column: str
+    # Operator name; the description lists the values execute_query handles.
+    op: str = Field(..., description="one of: eq, neq, contains, in, gte, lte")
+    # Comparison operand; with eq/neq a null value selects missing/non-missing cells.
+    value: Any
+# Declarative query plan: which columns to return and which row filters to apply.
+class QuerySpec(BaseModel):
+    # Columns to keep in the result; an empty list keeps every column.
+    select: List[str] = Field(default_factory=list)
+    # Filters applied one after another, each narrowing the previous result.
+    filters: List[FilterSpec] = Field(default_factory=list)
+    # When true, duplicate result rows are dropped.
+    distinct: bool = True
+    # Maximum number of rows returned.
+    limit: int = 50
+SYSTEM_PROMPT = """You are a data query planner.
+You receive:
+- user_question
+- available_columns
+- sample_values (small)
+Return ONLY valid JSON for QuerySpec with:
+- select: columns needed
+- filters: list of {column, op, value}
+- distinct: true/false
+- limit: integer <= 200
+Rules:
+- Prefer deterministic, simple filters.
+- If the user asks for a department like "Artificial Intelligence", filter Department equals that exact string.
+- If user says AI department, treat it as Department in ["Artificial Intelligence","AI/ML"] unless user explicitly excludes AI/ML.
+- Do NOT invent columns.
+"""
+def _json_safe(obj):
+    """Recursively convert NaN/inf to None so payload becomes valid JSON."""
+    if obj is None:
+        return None
+    if isinstance(obj, float):
+        if math.isnan(obj) or math.isinf(obj):
+            return None
+        return obj
+    if isinstance(obj, dict):
+        return {k: _json_safe(v) for k, v in obj.items()}
+    if isinstance(obj, list):
+        return [_json_safe(v) for v in obj]
+    return obj
+def _openai_client(api_key: str):
+    from openai import OpenAI
+    return OpenAI(api_key=api_key)
+def plan_query_with_llm(user_question: str, df: pd.DataFrame, api_key: str, model: str = "gpt-4.1-mini") -> QuerySpec:
+    cols = list(df.columns)
+    sample_df = df.head(8).copy()
+    sample_df = sample_df.where(pd.notna(sample_df), None)
+    sample = _json_safe(sample_df.to_dict(orient="records"))
+    client = _openai_client(api_key)
+    import json  # add at top of file if not present
+    payload = {
+    "user_question": user_question,
+    "available_columns": cols,
+    "sample_values": sample,
+}
+    resp = client.responses.create(
+    model=model,
+    input=[
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user", "content": json.dumps(payload)},
+    ],
+    temperature=0,
+    max_output_tokens=600,
+)
+    text = resp.output_text
+    try:
+        spec = QuerySpec.model_validate_json(text)
+        spec.limit = max(1, min(int(spec.limit), 200))
+        return spec
+    except ValidationError as e:
+        raise ValueError(f"Could not parse model output as QuerySpec JSON. Raw output:\\n{text}\\n\\nError:\\n{e}")
+def execute_query(spec: QuerySpec, df: pd.DataFrame) -> pd.DataFrame:
+    out = df.copy()
+    for f in spec.filters:
+        col = f.column
+        if col not in out.columns:
+            continue
+        op = f.op
+        val = f.value
+        if op == "eq":
+            if val is None:
+               out = out[out[col].isna()]
+            else:
+               out = out[out[col] == val]
+        elif op == "neq":
+            if val is None:
+               out = out[out[col].notna()]
+            else:
+               out = out[out[col] != val]
+        elif op == "contains":
+            out = out[out[col].astype(str).str.contains(str(val), case=False, na=False)]
+        elif op == "in":
+            if not isinstance(val, list):
+                val = [val]
+            out = out[out[col].isin(val)]
+        elif op == "gte":
+            out = out[pd.to_numeric(out[col], errors="coerce") >= float(val)]
+        elif op == "lte":
+            out = out[pd.to_numeric(out[col], errors="coerce") <= float(val)]
+    if spec.select:
+        safe_select = [c for c in spec.select if c in out.columns]
+        out = out[safe_select]
+    if spec.distinct:
+        out = out.drop_duplicates()
+    return out.head(spec.limit)
+def summarize_results_with_llm(user_question: str, result_df: pd.DataFrame, api_key: str, model: str = "gpt-4.1-mini") -> str:
+    client = _openai_client(api_key)
+    safe_df = result_df.copy().where(pd.notna(result_df), None)
+    preview = _json_safe(safe_df.to_dict(orient="records"))
+    import json  # add at top if not present
+    payload = {"question": user_question, "results": preview}
+    resp = client.responses.create(
+    model=model,
+    input=[
+        {"role": "system", "content": "You are a helpful analyst. Summarize results concisely and accurately."},
+        {"role": "user", "content": json.dumps(payload)},
+    ],
+    temperature=0.2,
+    max_output_tokens=500,
+)
+    return resp.output_text

src/core/security.py ADDED Viewed

	@@ -0,0 +1,19 @@

+from __future__ import annotations
+import re
+from typing import Tuple
+# Regexes that basic_injection_check searches for (re.search) in the lower-cased user text.
+INJECTION_PATTERNS = [
+    r"ignore (all|any) previous",
+    r"system prompt",
+    r"reveal.*(key|secret|token)",
+    r"exfiltrat",
+    r"prompt injection",
+]
+def basic_injection_check(user_text: str) -> Tuple[bool, str]:
+    t = (user_text or "").lower()
+    for pat in INJECTION_PATTERNS:
+        if re.search(pat, t):
+            return True, "That request looks like a prompt-injection attempt. I can only answer questions about the uploaded dataset."
+    return False, ""

src/eval/benchmarks.json ADDED Viewed

	@@ -0,0 +1,105 @@

+{
+  "dataset_expectations": {
+    "notes": "These benchmarks assume a cleaned dataset with standardized Department values."
+  },
+  "cases": [
+    {
+      "id": "ai_dept_names_including_aiml",
+      "question": "Can you tell me the name of users working in Artificial Intelligence department from all the data entries?",
+      "mode": "spec",
+      "spec": {
+        "select": [
+          "Name"
+        ],
+        "filters": [
+          {
+            "column": "Department",
+            "op": "in",
+            "value": [
+              "Artificial Intelligence",
+              "AI/ML"
+            ]
+          }
+        ],
+        "distinct": true,
+        "limit": 200
+      },
+      "expected": {
+        "type": "set_equals",
+        "column": "Name",
+        "values": [
+          "Sarah Johnson",
+          "Emily Davis",
+          "Robert Brown",
+          "John Martinez",
+          "Amanda White",
+          "Jessica Moore",
+          "Patricia Thomas",
+          "Michelle Clark",
+          "Mark Walker",
+          "Thomas Allen",
+          "Karen Young",
+          "Donna Wright",
+          "Nancy Hill",
+          "Mark Green",
+          "Sandra Adams",
+          "Brian Nelson",
+          "Susan Carter",
+          "Margaret Perez",
+          "Frank Phillips",
+          "Matthew Parker",
+          "Kenneth Collins",
+          "Andrew Sanchez"
+        ]
+      }
+    },
+    {
+      "id": "high_performers_ge_9",
+      "question": "Show employees with performance score >= 9",
+      "mode": "spec",
+      "spec": {
+        "select": [
+          "Name",
+          "Department",
+          "Performance_Score"
+        ],
+        "filters": [
+          {
+            "column": "Performance_Score",
+            "op": "gte",
+            "value": 9
+          }
+        ],
+        "distinct": true,
+        "limit": 200
+      },
+      "expected": {
+        "type": "row_count_gte",
+        "min_rows": 1
+      }
+    },
+    {
+      "id": "salary_missing",
+      "question": "How many employees have missing salary?",
+      "mode": "spec",
+      "spec": {
+        "select": [
+          "Salary"
+        ],
+        "filters": [
+          {
+            "column": "Salary",
+            "op": "eq",
+            "value": null
+          }
+        ],
+        "distinct": false,
+        "limit": 1000
+      },
+      "expected": {
+        "type": "row_count_gte",
+        "min_rows": 1
+      }
+    }
+  ]
+}

src/eval/run_agent_suite.py ADDED Viewed

	@@ -0,0 +1,119 @@

+"""
+Run the agent on all files in test_inputs/ and print a clear summary.
+Usage:
+    export OPENAI_API_KEY="sk-..."
+    python -m src.eval.run_agent_suite
+"""
+from pathlib import Path
+import json
+import os
+from statistics import mean
+from src.agent.graph import run_agent
+# Directory scanned for *.txt agent inputs.
+TEST_DIR = Path("test_inputs")
+# Model name passed to run_agent.
+MODEL = "gpt-4.1-mini"
+# Passed to run_agent as max_attempts.
+MAX_ATTEMPTS = 4
+def main():
+    api_key = os.environ.get("OPENAI_API_KEY", "").strip()
+    if not api_key:
+        raise SystemExit(
+            "OPENAI_API_KEY is not set. Export your key before running the suite:\n\n"
+            "export OPENAI_API_KEY=\"sk-...\""
+        )
+    inputs = sorted(TEST_DIR.glob("*.txt"))
+    if not inputs:
+        raise SystemExit("No test_inputs/*.txt files found. Create the sample inputs first.")
+    total = 0
+    passed_count = 0
+    correct_handling = 0
+    attempts_hist = []
+    attempts_hist_for_passed = []
+    # OPTIONAL: define expected behavior per file (True=expected PASS/valid handling)
+    # For correctness metric we will consider "handled correctly" as either:
+    # - produced employees & they match expectations (not available here), or
+    # - produced 0 employees and non-empty rejected (for cases that should be rejected)
+    # You can expand expected_outcomes if you want to mark specific cases as expected_fail, etc.
+    expected_outcomes = {
+        # "case09_no_ids.txt": "reject",  # example: expected to reject (no user_id)
+        # "case15_extreme_noise.txt": "reject",
+    }
+    for p in inputs:
+        total += 1
+        raw = p.read_text()
+        print(f"\n=== Running: {p.name} ===")
+        try:
+            final = run_agent(raw, api_key=api_key, model=MODEL, max_attempts=MAX_ATTEMPTS)
+        except Exception as e:
+            print(f"[ERROR] agent crashed for {p.name}: {e}")
+            continue
+        result = final.get("result")
+        log = final.get("log", [])
+        # attempts used is the max attempt number seen in log, fall back to 0
+        attempts_used = max((entry.get("attempt", 0) for entry in log), default=0)
+        employees_n = 0
+        rejected_n = 0
+        if result:
+            employees_n = len(result.get("employees", []))
+            rejected_n = len(result.get("rejected", []))
+        # Define pass = result is not None (valid schema produced)
+        passed = result is not None
+        # Print a concise line
+        print(f"{p.name}: {'PASS' if passed else 'FAIL'} | attempts={attempts_used} | employees={employees_n} | rejected={rejected_n}")
+        # Print extra info for failures or suspicious cases
+        if not passed:
+            print("-> Agent failed to produce schema-valid JSON within retry limit.")
+            print("Last JSON attempt:")
+            print(final.get("last_json_text", ""))
+        else:
+            # Optionally print the JSON for inspection of suspicious cases
+            if employees_n == 0 and rejected_n > 0:
+                print("-> No valid employees extracted; records were rejected (no hallucination).")
+            # You can uncomment to always show the JSON
+            # print(json.dumps(result, indent=2))
+        # Evaluate "correct handling" heuristically:
+        # if expected_outcomes says "reject" and the agent indeed rejected (employees==0 and rejected>0) -> correct
+        expected = expected_outcomes.get(p.name)
+        handled_correctly = False
+        if expected == "reject":
+            handled_correctly = (employees_n == 0 and rejected_n > 0)
+        else:
+            # default heuristic: producing a schema result is considered handling (but inspect counts)
+            handled_correctly = passed
+        if handled_correctly:
+            correct_handling += 1
+        if passed:
+            passed_count += 1
+            attempts_hist_for_passed.append(attempts_used)
+        attempts_hist.append(attempts_used)
+    # Summary
+    print("\n=== SUITE SUMMARY ===")
+    print(f"Total cases: {total}")
+    print(f"Schema-valid produced (pass): {passed_count}/{total} = {passed_count/total:.2%}")
+    print(f"Correct-handling (heuristic expected): {correct_handling}/{total} = {correct_handling/total:.2%}")
+    if attempts_hist:
+        print(f"Avg attempts (all cases): {mean(attempts_hist):.2f}")
+    if attempts_hist_for_passed:
+        print(f"Avg attempts (passed cases): {mean(attempts_hist_for_passed):.2f}")
+if __name__ == "__main__":
+    main()

src/eval/run_eval.py ADDED Viewed

	@@ -0,0 +1,113 @@

+from __future__ import annotations
+import argparse
+import json
+import os
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+import pandas as pd
+from src.core.cleaning import clean_dataframe
+from src.core.query import QuerySpec, FilterSpec, execute_query, plan_query_with_llm
+# Outcome of one benchmark case.
+@dataclass
+class CaseResult:
+    # The case's "id" from the benchmarks file.
+    case_id: str
+    # Whether the case met its expectation.
+    passed: bool
+    # Human-readable explanation: the check message, a skip reason, or exception text.
+    details: str
+def _load_benchmarks(path: Path) -> Dict[str, Any]:
+    return json.loads(path.read_text())
+def _spec_from_dict(d: Dict[str, Any]) -> QuerySpec:
+    filters = [FilterSpec(**f) for f in d.get("filters", [])]
+    return QuerySpec(
+        select=d.get("select", []),
+        filters=filters,
+        distinct=bool(d.get("distinct", True)),
+        limit=int(d.get("limit", 50)),
+    )
+def _check_expected(result_df: pd.DataFrame, expected: Dict[str, Any]) -> Tuple[bool, str]:
+    et = expected.get("type")
+    if et == "set_equals":
+        col = expected["column"]
+        want = set(expected["values"])
+        if col not in result_df.columns:
+            return False, f"Missing expected column '{col}'. Columns: {list(result_df.columns)}"
+        got = set([x for x in result_df[col].dropna().astype(str).tolist()])
+        missing = want - got
+        extra = got - want
+        if missing or extra:
+            return False, f"Set mismatch. Missing={sorted(missing)} Extra={sorted(extra)}"
+        return True, "OK"
+    if et == "row_count_gte":
+        min_rows = int(expected["min_rows"])
+        n = len(result_df)
+        return (n >= min_rows), f"Rows={n}, expected >= {min_rows}"
+    if et == "row_count_equals":
+        want = int(expected["rows"])
+        n = len(result_df)
+        return (n == want), f"Rows={n}, expected == {want}"
+    return False, f"Unknown expected.type '{et}'"
+def run(args: argparse.Namespace) -> int:
+    bench = _load_benchmarks(Path(args.benchmarks))
+    df_raw = pd.read_csv(args.csv)
+    df, report = clean_dataframe(df_raw)
+    results: List[CaseResult] = []
+    for case in bench["cases"]:
+        cid = case["id"]
+        mode = case.get("mode", "spec")
+        expected = case["expected"]
+        try:
+            if mode == "spec":
+                spec = _spec_from_dict(case["spec"])
+            elif mode == "llm":
+                if not args.api_key and not os.getenv("OPENAI_API_KEY"):
+                    results.append(CaseResult(cid, False, "No API key for LLM mode"))
+                    continue
+                api_key = args.api_key or os.getenv("OPENAI_API_KEY", "")
+                spec = plan_query_with_llm(case["question"], df, api_key=api_key, model=args.model)
+            else:
+                results.append(CaseResult(cid, False, f"Unknown mode '{mode}'"))
+                continue
+            out = execute_query(spec, df)
+            ok, details = _check_expected(out, expected)
+            results.append(CaseResult(cid, ok, details))
+        except Exception as e:
+            results.append(CaseResult(cid, False, f"Exception: {e}"))
+    passed = sum(1 for r in results if r.passed)
+    total = len(results)
+    print("\n=== Cleaning report ===")
+    print({"rows": report.rows, "fixes": report.fixes, "warnings": report.warnings})
+    print("\n=== Benchmark results ===")
+    for r in results:
+        status = "PASS" if r.passed else "FAIL"
+        print(f"[{status}] {r.case_id}: {r.details}")
+    print(f"\nSummary: {passed}/{total} passed")
+    return 0 if passed == total else 1
+# Command-line entry point: parse the arguments and exit with run()'s return code.
+if __name__ == "__main__":
+    p = argparse.ArgumentParser(description="Run benchmark evaluation for the AI Data Validation Agent")
+    p.add_argument("--csv", required=True, help="Path to CSV dataset")
+    p.add_argument("--benchmarks", default="src/eval/benchmarks.json", help="Path to benchmarks.json")
+    p.add_argument("--api-key", default="", help="OpenAI API key (optional; only needed for llm-mode cases)")
+    p.add_argument("--model", default="gpt-4.1-mini", help="Model for llm-mode cases")
+    raise SystemExit(run(p.parse_args()))

src/tests/test_query.py ADDED Viewed

	@@ -0,0 +1,11 @@

+import pandas as pd
+from src.core.query import QuerySpec, FilterSpec, execute_query
+def test_execute_query_eq():
+    df = pd.DataFrame([
+        {"Name":"A", "Department":"Artificial Intelligence"},
+        {"Name":"B", "Department":"Data Science"},
+    ])
+    spec = QuerySpec(select=["Name"], filters=[FilterSpec(column="Department", op="eq", value="Artificial Intelligence")])
+    out = execute_query(spec, df)
+    assert out["Name"].tolist() == ["A"]

src/tests_agent/test_offline_runner.py ADDED Viewed

	@@ -0,0 +1,30 @@

+import json
+from src.agent.offline_runner import run_offline_agent
+def test_offline_runner_fails_then_passes():
+    bad = json.dumps({
+        "employees": [{
+            "user_id": "101",   # wrong type
+            "name": "",         # invalid (min length)
+            "department": "ai"  # invalid enum
+        }]
+    })
+    good = json.dumps({
+        "employees": [{
+            "user_id": 101,
+            "name": "Michael Chen",
+            "age": 29,
+            "email": None,
+            "salary": 120000,
+            "join_date": None,
+            "department": "Artificial Intelligence",
+            "performance_score": None,
+            "location": None,
+            "job_title": None
+        }]
+    })
+    out = run_offline_agent("raw", [bad, good], max_attempts=2)
+    assert out["result"] is not None
+    assert len(out["log"]) >= 3

src/tests_agent/tests_schema.py ADDED Viewed

	@@ -0,0 +1,22 @@

+from src.agent.schemas import ExtractedData
+def test_schema_accepts_valid_employee():
+    payload = {
+        "employees": [
+            {
+                "user_id": 101,
+                "name": "Michael Chen",
+                "age": 29,
+                "email": None,
+                "salary": 120000,
+                "join_date": "2024-01-01",
+                "department": "Artificial Intelligence",
+                "performance_score": 9.5,
+                "location": "Chicago",
+                "job_title": "Engineer",
+            }
+        ]
+    }
+    data = ExtractedData.model_validate(payload)
+    assert data.employees[0].user_id == 101

test_inputs/case01_simple.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+ID: 101
+Name: michael chen
+Age: twenty nine
+Dept: ai
+Salary: $120,000
+Performance: 11
+Email: michael at gmail.com

test_inputs/case02_word_numbers.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+ID: 102
+Name: maria lopez
+Age: thirty two
+Dept: ai
+Salary: $120,000
+Performance: nine
+Email: maria at gmail.com

test_inputs/case03_missing_fields.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+ID: 103
+Name: kevin brown
+Dept: Data Science
+Salary: 88000

test_inputs/case04_bad_performance.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+ID: 104
+Name: Alice Green
+Age: 27
+Dept: ML
+Salary: 100000
+Performance: 15
+Email: alice@company.com

test_inputs/case05_weird_date.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+ID: 105
+Name: David Chen
+Age: 29
+Dept: AI/ML
+Salary: 99000
+Join: 14th Feb 2024
+Performance: 7.8

test_inputs/case06_random_noise.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+### Employee Record ###
+blah blah
+ID => 106
+Employee Name: robert taylor
+AGE: 31
+Department: machine learning
+salary approx $110,000 per year
+performance rating 8
+END RECORD

test_inputs/case07_duplicate_fields.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+ID: 107
+Name: Emma Watson
+Name: emma watson
+Age: 26
+Dept: DataScience
+Salary: $87000
+Performance: 8.2

test_inputs/case08_two_records.txt ADDED Viewed

	@@ -0,0 +1,15 @@

+Employee 1:
+ID: 201
+Name: michael scott
+Age: forty five
+Dept: ai
+Salary: 150000
+Performance: 9
+Employee 2:
+ID: 202
+Name: jim halpert
+Age: 30
+Department: sales
+Salary: 85000
+Performance: 7

test_inputs/case09_no_ids.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+Name: Sarah Connor
+Age: 33
+Dept: Artificial Intelligence
+Salary: 115000
+Performance: 8.7

test_inputs/case10_conflicting_values.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+ID: 301
+Name: Peter Parker
+Age: 25
+Age: 40
+Dept: AI
+Salary: 70000
+Performance: 6

test_inputs/case11_unstructured_paragraph.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+Michael Jordan joined the company recently. His ID is 401.
+He works in AI. He earns around $130,000 annually.
+Performance is 9.2. He is 35 years old.

test_inputs/case12_partial_sentences.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ 402, linda jones, thirty, data science, 92000, performance 8

test_inputs/case13_garbage_mixed.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+%%%%%%%
+Random text here.
+Employee start.
+ID: 501
+Name: Tony Stark
+Age: genius
+Dept: AI
+Salary: 300000
+Performance: 12
+Email: tony@starkindustries
+END

test_inputs/case14_large_block.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+ID: 601 Name: A Age: 25 Dept: AI Salary: 90000 Performance: 7
+ID: 602 Name: B Age: 26 Dept: AI Salary: 91000 Performance: 8
+ID: 603 Name: C Age: 27 Dept: AI Salary: 92000 Performance: 9
+ID: 604 Name: D Age: 28 Dept: AI Salary: 93000 Performance: 6
+ID: 605 Name: E Age: 29 Dept: AI Salary: 94000 Performance: 5

test_inputs/case15_extreme_noise.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+I have an employee somewhere. ID maybe 701?
+He is probably 28-ish. Works in something like AI.
+Salary could be 100k?? Not sure.
+Performance: maybe 8 or 9.