Siddhesh Patil commited on
Commit ·
b67668b
0
Parent(s):
Initial commit - Self-Correcting Data Validation Agent
Browse files- .gitignore +6 -0
- README.md +38 -0
- app.py +236 -0
- employee_dataset_50rows.csv +51 -0
- requirements.txt +8 -0
- src/agent/graph.py +225 -0
- src/agent/offline_runner.py +27 -0
- src/agent/schemas.py +45 -0
- src/core/cleaning.py +244 -0
- src/core/query.py +155 -0
- src/core/security.py +19 -0
- src/eval/benchmarks.json +105 -0
- src/eval/run_agent_suite.py +119 -0
- src/eval/run_eval.py +113 -0
- src/tests/test_query.py +11 -0
- src/tests_agent/test_offline_runner.py +30 -0
- src/tests_agent/tests_schema.py +22 -0
- test_inputs/case01_simple.txt +7 -0
- test_inputs/case02_word_numbers.txt +7 -0
- test_inputs/case03_missing_fields.txt +4 -0
- test_inputs/case04_bad_performance.txt +7 -0
- test_inputs/case05_weird_date.txt +7 -0
- test_inputs/case06_random_noise.txt +9 -0
- test_inputs/case07_duplicate_fields.txt +7 -0
- test_inputs/case08_two_records.txt +15 -0
- test_inputs/case09_no_ids.txt +5 -0
- test_inputs/case10_conflicting_values.txt +7 -0
- test_inputs/case11_unstructured_paragraph.txt +3 -0
- test_inputs/case12_partial_sentences.txt +1 -0
- test_inputs/case13_garbage_mixed.txt +11 -0
- test_inputs/case14_large_block.txt +5 -0
- test_inputs/case15_extreme_noise.txt +4 -0
.gitignore
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.venv/
|
| 2 |
+
__pycache__/
|
| 3 |
+
.env
|
| 4 |
+
.env.*
|
| 5 |
+
*.pyc
|
| 6 |
+
.DS_Store
|
README.md
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AI Data Validation Agent (Industry-ready)
|
| 2 |
+
|
| 3 |
+
A Streamlit app that:
|
| 4 |
+
- Cleans & normalizes messy HR-like tabular data (emails, dates, ages, salaries, departments)
|
| 5 |
+
- Produces an **audit report** of what was fixed
|
| 6 |
+
- Answers natural-language questions **deterministically** using pandas (not "LLM guesses")
|
| 7 |
+
- Uses an LLM only to convert the user's question into a small JSON query spec + to explain results
|
| 8 |
+
|
| 9 |
+
## Run locally
|
| 10 |
+
|
| 11 |
+
```bash
|
| 12 |
+
python -m venv .venv && source .venv/bin/activate
|
| 13 |
+
pip install -r requirements.txt
|
| 14 |
+
streamlit run app.py
|
| 15 |
+
```
|
| 16 |
+
|
| 17 |
+
## Environment variables
|
| 18 |
+
|
| 19 |
+
Set your OpenAI key in **Streamlit sidebar** or via env var:
|
| 20 |
+
|
| 21 |
+
```bash
|
| 22 |
+
export OPENAI_API_KEY="..."
|
| 23 |
+
```
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
## Evaluation (accuracy benchmarks)
|
| 27 |
+
|
| 28 |
+
Run deterministic benchmarks (no LLM required):
|
| 29 |
+
|
| 30 |
+
```bash
|
| 31 |
+
python -m venv .venv && source .venv/bin/activate
|
| 32 |
+
pip install -r requirements.txt
|
| 33 |
+
python -m src.eval.run_eval --csv your_data.csv
|
| 34 |
+
```
|
| 35 |
+
|
| 36 |
+
Optional end-to-end LLM evaluation:
|
| 37 |
+
- Set `OPENAI_API_KEY`, and change a benchmark case `mode` to `"llm"` in `src/eval/benchmarks.json`.
|
| 38 |
+
- Then re-run the command above.
|
app.py
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import streamlit as st
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from src.agent.graph import run_agent
|
| 6 |
+
from src.core.cleaning import clean_dataframe
|
| 7 |
+
from src.core.query import plan_query_with_llm, execute_query, summarize_results_with_llm
|
| 8 |
+
from src.core.security import basic_injection_check
|
| 9 |
+
|
| 10 |
+
st.set_page_config(page_title="AI Data Validation Agent (Industry-ready)", page_icon="🤖", layout="wide")
|
| 11 |
+
|
| 12 |
+
st.title("🤖 AI Data Validation Agent (Industry-ready)")
|
| 13 |
+
st.caption("Deterministic pandas answers + LLM planning/summarization (no LLM guessing).")
|
| 14 |
+
|
| 15 |
+
with st.sidebar:
|
| 16 |
+
st.header("Configuration")
|
| 17 |
+
api_key = st.text_input("OpenAI API Key", value=os.getenv("OPENAI_API_KEY",""), type="password")
|
| 18 |
+
model = st.selectbox("Model", ["gpt-4.1-mini","gpt-4.1","gpt-4o-mini","gpt-4o"], index=0)
|
| 19 |
+
st.divider()
|
| 20 |
+
uploaded = st.file_uploader("Upload CSV", type=["csv"])
|
| 21 |
+
st.divider()
|
| 22 |
+
st.markdown("**Tip:** Clean the data first, then ask questions.")
|
| 23 |
+
|
| 24 |
+
if "df_raw" not in st.session_state:
|
| 25 |
+
st.session_state.df_raw = None
|
| 26 |
+
if "df_clean" not in st.session_state:
|
| 27 |
+
st.session_state.df_clean = None
|
| 28 |
+
if "clean_report" not in st.session_state:
|
| 29 |
+
st.session_state.clean_report = None
|
| 30 |
+
|
| 31 |
+
tab1, tab2, tab3 = st.tabs(["📄 Data cleaning", "💬 Ask questions", "🧠 Self-correcting agent"])
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
with tab1:
|
| 35 |
+
st.subheader("1) Upload & clean")
|
| 36 |
+
if uploaded is None:
|
| 37 |
+
st.info("Upload a CSV from the sidebar.")
|
| 38 |
+
else:
|
| 39 |
+
df = pd.read_csv(uploaded)
|
| 40 |
+
st.session_state.df_raw = df
|
| 41 |
+
st.write("Raw preview")
|
| 42 |
+
st.dataframe(df.head(20), use_container_width=True)
|
| 43 |
+
|
| 44 |
+
if st.button("Clean & normalize", type="primary"):
|
| 45 |
+
dfc, report = clean_dataframe(df)
|
| 46 |
+
st.session_state.df_clean = dfc
|
| 47 |
+
st.session_state.clean_report = report
|
| 48 |
+
|
| 49 |
+
if st.session_state.df_clean is not None:
|
| 50 |
+
st.success("Cleaned dataset ready ✅")
|
| 51 |
+
report = st.session_state.clean_report
|
| 52 |
+
st.write("Cleaning report")
|
| 53 |
+
st.json({"rows": report.rows, "fixes": report.fixes, "warnings": report.warnings})
|
| 54 |
+
st.write("Clean preview")
|
| 55 |
+
st.dataframe(st.session_state.df_clean, use_container_width=True)
|
| 56 |
+
|
| 57 |
+
csv_bytes = st.session_state.df_clean.to_csv(index=False).encode("utf-8")
|
| 58 |
+
st.download_button("Download cleaned CSV", data=csv_bytes, file_name="cleaned.csv", mime="text/csv")
|
| 59 |
+
|
| 60 |
+
with tab2:
|
| 61 |
+
st.subheader("2) Ask deterministic questions")
|
| 62 |
+
if st.session_state.df_clean is None:
|
| 63 |
+
st.warning("Clean your dataset first (Data cleaning tab).")
|
| 64 |
+
else:
|
| 65 |
+
question = st.text_input("Ask a question about the dataset", placeholder='e.g., "Names of users in Artificial Intelligence department"')
|
| 66 |
+
colA, colB = st.columns([1,1])
|
| 67 |
+
with colA:
|
| 68 |
+
run = st.button("Run query", type="primary")
|
| 69 |
+
with colB:
|
| 70 |
+
show_plan = st.checkbox("Show query plan (JSON)", value=False)
|
| 71 |
+
|
| 72 |
+
if run:
|
| 73 |
+
if not api_key:
|
| 74 |
+
st.error("Please add your OpenAI API key in the sidebar.")
|
| 75 |
+
elif not question.strip():
|
| 76 |
+
st.error("Please type a question.")
|
| 77 |
+
else:
|
| 78 |
+
blocked, msg = basic_injection_check(question)
|
| 79 |
+
if blocked:
|
| 80 |
+
st.error(msg)
|
| 81 |
+
else:
|
| 82 |
+
try:
|
| 83 |
+
spec = plan_query_with_llm(question, st.session_state.df_clean, api_key=api_key, model=model)
|
| 84 |
+
if show_plan:
|
| 85 |
+
st.code(spec.model_dump_json(indent=2), language="json")
|
| 86 |
+
|
| 87 |
+
result = execute_query(spec, st.session_state.df_clean)
|
| 88 |
+
|
| 89 |
+
st.write("Result table")
|
| 90 |
+
st.dataframe(result, use_container_width=True)
|
| 91 |
+
|
| 92 |
+
answer = summarize_results_with_llm(question, result, api_key=api_key, model=model)
|
| 93 |
+
st.markdown("### Answer")
|
| 94 |
+
st.write(answer)
|
| 95 |
+
|
| 96 |
+
except Exception as e:
|
| 97 |
+
st.error(str(e))
|
| 98 |
+
|
| 99 |
+
st.divider()
|
| 100 |
+
st.markdown("### Why this is accurate")
|
| 101 |
+
st.markdown("- LLM only creates a small JSON query plan.\n- Pandas executes it deterministically.\n- LLM only summarizes already computed results.")
|
| 102 |
+
from src.agent.graph import run_agent
|
| 103 |
+
import pandas as pd
|
| 104 |
+
|
| 105 |
+
with tab3:
|
| 106 |
+
st.subheader("Self-Correcting Data Validation Agent")
|
| 107 |
+
st.caption("Paste messy data → Extract JSON → Validate → Auto-correct retries → Final schema-perfect output (NO hallucination)")
|
| 108 |
+
|
| 109 |
+
raw = st.text_area("Paste messy employee data (any format)", height=220)
|
| 110 |
+
max_attempts = st.slider("Max retries", 1, 6, 3)
|
| 111 |
+
|
| 112 |
+
# --- 1) VISUALIZE STATE MACHINE (diagram) ---
|
| 113 |
+
st.markdown("### 🧭 State Machine (Extract → Validate → Correct → Finalize)")
|
| 114 |
+
|
| 115 |
+
dot = """
|
| 116 |
+
digraph G {
|
| 117 |
+
rankdir=LR;
|
| 118 |
+
node [shape=box, style="rounded,filled", color="#444444", fillcolor="#F4F6F8"];
|
| 119 |
+
|
| 120 |
+
Extract [label="extract"];
|
| 121 |
+
Validate [label="validate"];
|
| 122 |
+
Correct [label="correct"];
|
| 123 |
+
Finalize [label="finalize"];
|
| 124 |
+
|
| 125 |
+
Extract -> Validate;
|
| 126 |
+
Validate -> Finalize [label="pass OR max_retries"];
|
| 127 |
+
Validate -> Correct [label="fail AND retries_left"];
|
| 128 |
+
Correct -> Validate;
|
| 129 |
+
}
|
| 130 |
+
"""
|
| 131 |
+
try:
|
| 132 |
+
st.graphviz_chart(dot, use_container_width=True)
|
| 133 |
+
except Exception:
|
| 134 |
+
st.code(
|
| 135 |
+
"extract → validate → (pass) finalize\n"
|
| 136 |
+
" ↘ (fail) correct → validate (loop)\n",
|
| 137 |
+
language="text"
|
| 138 |
+
)
|
| 139 |
+
|
| 140 |
+
if st.button("Run Agent", type="primary"):
|
| 141 |
+
if not api_key:
|
| 142 |
+
st.error("Please add your OpenAI API key in the sidebar.")
|
| 143 |
+
st.stop()
|
| 144 |
+
if not raw.strip():
|
| 145 |
+
st.error("Paste some messy data first.")
|
| 146 |
+
st.stop()
|
| 147 |
+
|
| 148 |
+
with st.spinner("Running extract → validate → correct loop..."):
|
| 149 |
+
final_state = run_agent(raw, api_key=api_key, model=model, max_attempts=max_attempts)
|
| 150 |
+
|
| 151 |
+
log = final_state.get("log", [])
|
| 152 |
+
result = final_state.get("result")
|
| 153 |
+
|
| 154 |
+
# --- 3) CORRECTION COUNT SUMMARY (metrics) ---
|
| 155 |
+
# attempts used
|
| 156 |
+
attempts_used = max((x.get("attempt", 0) for x in log), default=0)
|
| 157 |
+
|
| 158 |
+
# counts
|
| 159 |
+
employees_n = len(result.get("employees", [])) if result else 0
|
| 160 |
+
rejected_n = len(result.get("rejected", [])) if result else 0
|
| 161 |
+
|
| 162 |
+
# how many correct steps happened
|
| 163 |
+
correct_steps = sum(1 for x in log if x.get("step") == "correct")
|
| 164 |
+
validate_fails = sum(1 for x in log if x.get("step") == "validate" and x.get("status") == "fail")
|
| 165 |
+
|
| 166 |
+
st.markdown("### 📊 Run Summary")
|
| 167 |
+
c1, c2, c3, c4 = st.columns(4)
|
| 168 |
+
c1.metric("Attempts used", attempts_used if attempts_used else 1)
|
| 169 |
+
c2.metric("Corrections", correct_steps)
|
| 170 |
+
c3.metric("Valid employees", employees_n)
|
| 171 |
+
c4.metric("Rejected records", rejected_n)
|
| 172 |
+
|
| 173 |
+
# Optional: show pass/fail clearly
|
| 174 |
+
if result is None:
|
| 175 |
+
st.error("Could not produce schema-valid JSON within retry limit.")
|
| 176 |
+
else:
|
| 177 |
+
st.success("Schema-valid output ✅")
|
| 178 |
+
|
| 179 |
+
# --- 2) BEFORE / AFTER COMPARISON ---
|
| 180 |
+
st.markdown("### 🔁 Before vs After")
|
| 181 |
+
left, right = st.columns(2)
|
| 182 |
+
|
| 183 |
+
with left:
|
| 184 |
+
st.markdown("#### Before (Raw Input)")
|
| 185 |
+
st.code(raw.strip(), language="text")
|
| 186 |
+
|
| 187 |
+
with right:
|
| 188 |
+
st.markdown("#### After (Schema Output)")
|
| 189 |
+
if result is None:
|
| 190 |
+
st.code(final_state.get("last_json_text", ""), language="json")
|
| 191 |
+
else:
|
| 192 |
+
st.code(json.dumps(result, indent=2, default=str), language="json")
|
| 193 |
+
|
| 194 |
+
# --- Correction Log (keep your existing) ---
|
| 195 |
+
st.markdown("### 🧾 Correction Log")
|
| 196 |
+
st.json(log)
|
| 197 |
+
|
| 198 |
+
# If failed, stop here
|
| 199 |
+
if result is None:
|
| 200 |
+
st.markdown("### Last JSON Attempt (debug)")
|
| 201 |
+
st.code(final_state.get("last_json_text", ""), language="json")
|
| 202 |
+
st.stop()
|
| 203 |
+
|
| 204 |
+
# -------- Valid employees table --------
|
| 205 |
+
st.markdown("### ✅ Valid Employees")
|
| 206 |
+
employees = result.get("employees", [])
|
| 207 |
+
if employees:
|
| 208 |
+
df_emp = pd.DataFrame(employees)
|
| 209 |
+
st.dataframe(df_emp, use_container_width=True)
|
| 210 |
+
else:
|
| 211 |
+
st.info("No valid employees extracted (all records were rejected).")
|
| 212 |
+
|
| 213 |
+
# -------- Rejected records table --------
|
| 214 |
+
st.markdown("### 🚫 Rejected Records (No hallucination)")
|
| 215 |
+
rejected = result.get("rejected", [])
|
| 216 |
+
if rejected:
|
| 217 |
+
rej_rows = []
|
| 218 |
+
for r in rejected:
|
| 219 |
+
rej_rows.append(
|
| 220 |
+
{
|
| 221 |
+
"raw_record": r.get("raw_record", ""),
|
| 222 |
+
"reasons": "; ".join(r.get("reasons", [])),
|
| 223 |
+
}
|
| 224 |
+
)
|
| 225 |
+
df_rej = pd.DataFrame(rej_rows)
|
| 226 |
+
st.dataframe(df_rej, use_container_width=True)
|
| 227 |
+
else:
|
| 228 |
+
st.info("No rejected records. Everything was schema-valid.")
|
| 229 |
+
|
| 230 |
+
# -------- Download --------
|
| 231 |
+
st.download_button(
|
| 232 |
+
"Download JSON",
|
| 233 |
+
data=json.dumps(result, indent=2, default=str),
|
| 234 |
+
file_name="validated_output.json",
|
| 235 |
+
mime="application/json",
|
| 236 |
+
)
|
employee_dataset_50rows.csv
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
user_id,Name,Age,Email,Salary,Join_Date,Department,Performance_Score,Location,Job_Title
|
| 2 |
+
1, Sarah Johnson ,28,sarah.j@techcorp.com,95000,2023-01-15,AI,8.5,New York,Senior Engineer
|
| 3 |
+
2,michael chen, thirty two ,michael at gmail.com,110000 ,15/02/2023,Data Science,9.2,San Francisco,Lead Analyst
|
| 4 |
+
3,EMILY DAVIS,,emily@techcorp,87000,2023/03/10,ai,7.8,NYC,Engineer
|
| 5 |
+
4,James Wilson,twenty nine,james.wilson@company.com,92000,01-04-2023,Machine Learning,8.7,San Francisco,ML Engineer
|
| 6 |
+
5, Robert Brown ,35,robert@techcorp.com,$125000,2023-05-20,AI , 9.5,New York,Principal Engineer
|
| 7 |
+
6,Jennifer Lee,27,jennifer@@techcorp.com,NaN,2023-06-01,Data science,8.3,Chicago,Data Scientist
|
| 8 |
+
7,Sarah Johnson,28,sarah.j@techcorp.com,95000,2023-01-15,AI,8.5,New York,Senior Engineer
|
| 9 |
+
8,David Kim, 31, ,103000,2023-07-15,ML,nine,Seattle,Senior ML Engineer
|
| 10 |
+
9,Lisa Anderson,26,lisa.anderson@tech.com,88000,2023-08-01,DataScience,8.9,Boston,Data Engineer
|
| 11 |
+
10,,thirty,maria@techcorp.com,94000,2023-09-10,AI,8.6,New York,Engineer
|
| 12 |
+
11,JOHN MARTINEZ, ,john.m@company.com,98000,12th Oct 2023,Artificial Intelligence,8.2,Chicago,Senior Engineer
|
| 13 |
+
12,Amanda White,29,amanda@techcorp.com, 102000 ,2023-11-01,AI/ML,9.1,San Francisco,Tech Lead
|
| 14 |
+
13,Chris Taylor,33,chris@@company.com,115000,2023-12-01,data science,8.8,Seattle,Principal Analyst
|
| 15 |
+
14,Jessica Moore,twenty five,jessica@techcorp.com,85000USD,2024-01-05,AI,7.9,Boston,Junior Engineer
|
| 16 |
+
15,daniel garcia,30,daniel at company.com,99000,2024-02-01,MachineLearning ,8.4,NYC,ML Engineer
|
| 17 |
+
16,PATRICIA THOMAS,34,patricia@tech,120000,2024-02-15,AI,9.3,San Francisco,Staff Engineer
|
| 18 |
+
17,Ryan Harris,28,ryan.h@techcorp.com,96000,10/03/2024,Data Science,8.5,Chicago,Senior Analyst
|
| 19 |
+
18,Michelle Clark,27,michelle@company.com,$91000,2024-03-20,ai,8.1,New York,Engineer
|
| 20 |
+
19,Kevin Rodriguez, 29 ,kevin@@techcorp.com,105000,01-04-2024,ML,9.0,Seattle,Senior Engineer
|
| 21 |
+
20,Laura Lewis,twenty six,laura.lewis@tech.com,87000,2024-04-15,DataScience,7.7,Boston,Data Analyst
|
| 22 |
+
21, Mark Walker ,32,mark@techcorp.com,NaN,2024-05-01,AI,8.7,San Francisco,Senior Engineer
|
| 23 |
+
22,Angela Hall,28,angela at gmail.com,93000 ,15/05/2024,Data science,eight,NYC,Data Scientist
|
| 24 |
+
23,Thomas Allen,35,thomas@company,112000,2024/06/01,Artificial Intelligence,9.4,Chicago,Principal Engineer
|
| 25 |
+
24,Karen Young,27,karen.y@techcorp.com,89000,05-06-2024,ai,8.3,Seattle,Engineer
|
| 26 |
+
25,Steven King,thirty one,steven@@tech.com,107000,2024-07-10,Machine Learning,8.9,Boston,ML Engineer
|
| 27 |
+
26,DONNA WRIGHT,29,donna@techcorp.com,$98000,2024-07-20,AI/ML,8.6,San Francisco,Tech Lead
|
| 28 |
+
27,Jason Lopez,26,jason.lopez@company.com, 85000 ,2024-08-01,Data Science,7.8,New York,Junior Analyst
|
| 29 |
+
28,Nancy Hill,thirty,nancy at techcorp.com,101000,15th Aug 2024,ai,9.1,Chicago,Senior Engineer
|
| 30 |
+
29,paul scott,28,paul@tech,95000,2024/09/01,DataScience,8.4,Seattle,Data Engineer
|
| 31 |
+
30,,27,betty@techcorp.com,88000,2024-09-15,ML,8.2,Boston,Engineer
|
| 32 |
+
31,Mark Green,33,mark.g@@company.com,118000USD,01-10-2024,Artificial Intelligence,9.2,San Francisco,Staff Engineer
|
| 33 |
+
32,Sandra Adams,twenty eight,sandra@techcorp.com,92000,2024-10-10,AI,8.0,NYC,Engineer
|
| 34 |
+
33,GEORGE BAKER,29,george at company.com, 106000,2024/10/20,Data science,8.8,Chicago,Senior Analyst
|
| 35 |
+
34,Carol Gonzalez,31,carol@tech.com,$99000,2024-11-01,Machine Learning,8.5,Seattle,ML Engineer
|
| 36 |
+
35,Brian Nelson,26,brian@@techcorp.com,NaN,2024-11-15,ai,7.9,Boston,Junior Engineer
|
| 37 |
+
36, Susan Carter ,thirty two,susan@company.com,114000,01-12-2024,AI/ML,9.3,San Francisco,Principal Engineer
|
| 38 |
+
37,edward mitchell,27,edward.m@techcorp.com,90000 ,15/12/2024,DataScience,8.3,New York,Data Scientist
|
| 39 |
+
38,Margaret Perez,28,margaret at tech.com,97000,2024/12/20,AI,8.7,Chicago,Senior Engineer
|
| 40 |
+
39,JOSEPH ROBERTS,thirty,joseph@@company.com,108000,2025-01-05,Data Science,nine,Seattle,Tech Lead
|
| 41 |
+
40,Lisa Turner,29,lisa.t@techcorp,103000,2025-01-15,Machine Learning,8.6,Boston,Senior ML Engineer
|
| 42 |
+
41,Frank Phillips,27,frank@techcorp.com,$94000,20/01/2025,ai,8.1,San Francisco,Engineer
|
| 43 |
+
42,Helen Campbell, 26 ,helen at company.com,86000,2025/02/01,DataScience,7.8,NYC,Data Analyst
|
| 44 |
+
43,matthew parker,31,matthew@@tech.com, 111000 ,05-02-2025,Artificial Intelligence,9.0,Chicago,Principal Analyst
|
| 45 |
+
44,,twenty nine,elizabeth@techcorp.com,100000USD,2025-02-15,AI/ML,8.8,Seattle,Tech Lead
|
| 46 |
+
45,ANTHONY EVANS,28,anthony@company,96000,2025/03/01,Data science,8.4,Boston,Senior Analyst
|
| 47 |
+
46, Barbara Edwards ,30,barbara.e@techcorp.com,NaN,10th Mar 2025,Machine Learning,8.9,San Francisco,ML Engineer
|
| 48 |
+
47,kenneth collins,27,kenneth at tech.com,91000 ,2025-03-20,ai,8.2,New York,Engineer
|
| 49 |
+
48,Dorothy Stewart,thirty three,dorothy@@company.com,$104000,01-04-2025,DataScience,9.1,Chicago,Senior Data Scientist
|
| 50 |
+
49,ANDREW SANCHEZ,26,andrew@techcorp.com,87000,2025/04/10,AI,7.7,Seattle,Junior Engineer
|
| 51 |
+
50,donna morris,29,donna.m at company.com, 99000 ,2025-04-20,Data Science,8.5,Boston,Data Analyst
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit>=1.31
|
| 2 |
+
pandas>=2.0
|
| 3 |
+
python-dateutil>=2.8
|
| 4 |
+
openai>=1.0.0
|
| 5 |
+
pydantic>=2.0
|
| 6 |
+
rapidfuzz>=3.0
|
| 7 |
+
langgraph>=0.2.0
|
| 8 |
+
pytest>=8.0
|
src/agent/graph.py
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import math
|
| 5 |
+
from typing import Any, Dict, List, Optional, TypedDict
|
| 6 |
+
|
| 7 |
+
from langgraph.graph import StateGraph, END
|
| 8 |
+
from pydantic import ValidationError
|
| 9 |
+
|
| 10 |
+
from src.agent.schemas import ExtractedData
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# ---------- JSON safety ----------
|
| 14 |
+
def _json_safe(obj):
|
| 15 |
+
"""Recursively convert NaN/inf to None so payload becomes valid JSON."""
|
| 16 |
+
if obj is None:
|
| 17 |
+
return None
|
| 18 |
+
if isinstance(obj, float):
|
| 19 |
+
if math.isnan(obj) or math.isinf(obj):
|
| 20 |
+
return None
|
| 21 |
+
return obj
|
| 22 |
+
if isinstance(obj, dict):
|
| 23 |
+
return {k: _json_safe(v) for k, v in obj.items()}
|
| 24 |
+
if isinstance(obj, list):
|
| 25 |
+
return [_json_safe(v) for v in obj]
|
| 26 |
+
return obj
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# ---------- OpenAI helper ----------
|
| 30 |
+
def _openai_client(api_key: str):
|
| 31 |
+
from openai import OpenAI
|
| 32 |
+
|
| 33 |
+
# robust for flaky networks
|
| 34 |
+
return OpenAI(api_key=api_key, timeout=60, max_retries=5)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
EXTRACT_SYSTEM = """You are a data extraction + validation agent.
|
| 38 |
+
|
| 39 |
+
Your job: convert messy text into STRICT JSON that matches this schema:
|
| 40 |
+
|
| 41 |
+
{
|
| 42 |
+
"employees": [
|
| 43 |
+
{
|
| 44 |
+
"user_id": int,
|
| 45 |
+
"name": string,
|
| 46 |
+
"age": int|null,
|
| 47 |
+
"email": string|null,
|
| 48 |
+
"salary": number|null,
|
| 49 |
+
"join_date": "YYYY-MM-DD"|null,
|
| 50 |
+
"department": one of ["Artificial Intelligence","AI/ML","Machine Learning","Data Science"],
|
| 51 |
+
"performance_score": number|null (0..10),
|
| 52 |
+
"location": string|null,
|
| 53 |
+
"job_title": string|null
|
| 54 |
+
}
|
| 55 |
+
],
|
| 56 |
+
"rejected": [
|
| 57 |
+
{ "raw_record": string, "reasons": [string, ...] }
|
| 58 |
+
]
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
CRITICAL RULES (NO HALLUCINATION):
|
| 62 |
+
- NEVER invent user_id. If user_id is missing/uncertain, DO NOT guess.
|
| 63 |
+
Put that record into "rejected" with reason "missing user_id".
|
| 64 |
+
- NEVER guess values from vague text like "maybe", "around", "probably", "approx".
|
| 65 |
+
Use null for uncertain optional fields.
|
| 66 |
+
- If a record cannot be made schema-valid WITHOUT guessing required fields, reject it.
|
| 67 |
+
- Do not fabricate emails or domains. If email is invalid -> null (or reject only if required, but email is optional here).
|
| 68 |
+
|
| 69 |
+
Normalization rules:
|
| 70 |
+
- Output JSON ONLY, no markdown.
|
| 71 |
+
- If a field is missing, set it to null (not empty string).
|
| 72 |
+
- Normalize department values:
|
| 73 |
+
AI/ai/Artificial Intelligence -> "Artificial Intelligence"
|
| 74 |
+
AI/ML -> "AI/ML"
|
| 75 |
+
ML/Machine Learning -> "Machine Learning"
|
| 76 |
+
DataScience/Data science -> "Data Science"
|
| 77 |
+
- Convert word numbers (e.g., "twenty nine") to integers when clear.
|
| 78 |
+
- Convert dates to ISO YYYY-MM-DD if possible, else null.
|
| 79 |
+
- Salary: remove $ and commas; if missing, null.
|
| 80 |
+
- performance_score must be 0..10; if value is out of range or unclear -> null.
|
| 81 |
+
"""
|
| 82 |
+
|
| 83 |
+
CORRECT_SYSTEM = """You are a self-correcting data validation agent.
|
| 84 |
+
|
| 85 |
+
You will be given:
|
| 86 |
+
- the previous JSON you produced
|
| 87 |
+
- a validation error message describing why it failed
|
| 88 |
+
|
| 89 |
+
Fix the JSON to satisfy the schema.
|
| 90 |
+
|
| 91 |
+
CRITICAL RULES (NO HALLUCINATION):
|
| 92 |
+
- NEVER invent user_id. If user_id is missing/uncertain, reject the record instead of guessing.
|
| 93 |
+
- NEVER guess uncertain values (maybe/around/probably). Use null for optional fields.
|
| 94 |
+
- Prefer moving problematic records to "rejected" with clear reasons rather than fabricating data.
|
| 95 |
+
|
| 96 |
+
Rules:
|
| 97 |
+
- Output JSON ONLY.
|
| 98 |
+
- Keep valid records in "employees".
|
| 99 |
+
- Put non-fixable records in "rejected" with reasons.
|
| 100 |
+
- Use null for missing fields (not empty strings).
|
| 101 |
+
"""
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
# ---------- LangGraph State ----------
|
| 105 |
+
class AgentState(TypedDict):
|
| 106 |
+
raw_text: str
|
| 107 |
+
attempt: int
|
| 108 |
+
max_attempts: int
|
| 109 |
+
last_json_text: str
|
| 110 |
+
validation_error: str
|
| 111 |
+
result: Optional[Dict[str, Any]]
|
| 112 |
+
log: List[Dict[str, Any]]
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def _llm_extract(state: AgentState, api_key: str, model: str) -> AgentState:
|
| 116 |
+
client = _openai_client(api_key)
|
| 117 |
+
payload = {"raw_text": state["raw_text"]}
|
| 118 |
+
|
| 119 |
+
resp = client.responses.create(
|
| 120 |
+
model=model,
|
| 121 |
+
input=[
|
| 122 |
+
{"role": "system", "content": EXTRACT_SYSTEM},
|
| 123 |
+
{"role": "user", "content": json.dumps(payload)},
|
| 124 |
+
],
|
| 125 |
+
temperature=0,
|
| 126 |
+
max_output_tokens=1400,
|
| 127 |
+
)
|
| 128 |
+
out = (resp.output_text or "").strip()
|
| 129 |
+
|
| 130 |
+
state["last_json_text"] = out
|
| 131 |
+
state["log"].append({"step": "extract", "attempt": state["attempt"], "output": out[:2000]})
|
| 132 |
+
return state
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def _validate(state: AgentState) -> AgentState:
|
| 136 |
+
try:
|
| 137 |
+
data = ExtractedData.model_validate_json(state["last_json_text"])
|
| 138 |
+
state["result"] = _json_safe(data.model_dump())
|
| 139 |
+
state["validation_error"] = ""
|
| 140 |
+
state["log"].append({"step": "validate", "attempt": state["attempt"], "status": "pass"})
|
| 141 |
+
except ValidationError as e:
|
| 142 |
+
state["result"] = None
|
| 143 |
+
state["validation_error"] = str(e)
|
| 144 |
+
state["log"].append(
|
| 145 |
+
{
|
| 146 |
+
"step": "validate",
|
| 147 |
+
"attempt": state["attempt"],
|
| 148 |
+
"status": "fail",
|
| 149 |
+
"error": state["validation_error"][:2000],
|
| 150 |
+
}
|
| 151 |
+
)
|
| 152 |
+
return state
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def _llm_correct(state: AgentState, api_key: str, model: str) -> AgentState:
|
| 156 |
+
client = _openai_client(api_key)
|
| 157 |
+
payload = {
|
| 158 |
+
"previous_json": state["last_json_text"],
|
| 159 |
+
"validation_error": state["validation_error"],
|
| 160 |
+
}
|
| 161 |
+
|
| 162 |
+
resp = client.responses.create(
|
| 163 |
+
model=model,
|
| 164 |
+
input=[
|
| 165 |
+
{"role": "system", "content": CORRECT_SYSTEM},
|
| 166 |
+
{"role": "user", "content": json.dumps(payload)},
|
| 167 |
+
],
|
| 168 |
+
temperature=0,
|
| 169 |
+
max_output_tokens=1400,
|
| 170 |
+
)
|
| 171 |
+
out = (resp.output_text or "").strip()
|
| 172 |
+
|
| 173 |
+
state["last_json_text"] = out
|
| 174 |
+
state["log"].append({"step": "correct", "attempt": state["attempt"], "output": out[:2000]})
|
| 175 |
+
return state
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def _should_retry(state: AgentState) -> str:
|
| 179 |
+
if state["result"] is not None:
|
| 180 |
+
return "finalize"
|
| 181 |
+
if state["attempt"] >= state["max_attempts"]:
|
| 182 |
+
return "finalize"
|
| 183 |
+
return "retry"
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
def build_graph(api_key: str, model: str):
|
| 187 |
+
g = StateGraph(AgentState)
|
| 188 |
+
|
| 189 |
+
g.add_node("extract", lambda s: _llm_extract(s, api_key, model))
|
| 190 |
+
g.add_node("validate", _validate)
|
| 191 |
+
g.add_node("correct", lambda s: _llm_correct(s, api_key, model))
|
| 192 |
+
|
| 193 |
+
g.set_entry_point("extract")
|
| 194 |
+
g.add_edge("extract", "validate")
|
| 195 |
+
g.add_conditional_edges(
|
| 196 |
+
"validate",
|
| 197 |
+
_should_retry,
|
| 198 |
+
{"retry": "correct", "finalize": END},
|
| 199 |
+
)
|
| 200 |
+
|
| 201 |
+
# after correcting, increment attempt then validate again
|
| 202 |
+
def inc_attempt(state: AgentState) -> AgentState:
|
| 203 |
+
state["attempt"] += 1
|
| 204 |
+
return state
|
| 205 |
+
|
| 206 |
+
g.add_node("inc_attempt", inc_attempt)
|
| 207 |
+
g.add_edge("correct", "inc_attempt")
|
| 208 |
+
g.add_edge("inc_attempt", "validate")
|
| 209 |
+
|
| 210 |
+
return g.compile()
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
def run_agent(raw_text: str, api_key: str, model: str = "gpt-4.1-mini", max_attempts: int = 3):
|
| 214 |
+
graph = build_graph(api_key, model)
|
| 215 |
+
init: AgentState = {
|
| 216 |
+
"raw_text": raw_text,
|
| 217 |
+
"attempt": 1,
|
| 218 |
+
"max_attempts": max_attempts,
|
| 219 |
+
"last_json_text": "",
|
| 220 |
+
"validation_error": "",
|
| 221 |
+
"result": None,
|
| 222 |
+
"log": [],
|
| 223 |
+
}
|
| 224 |
+
final_state = graph.invoke(init)
|
| 225 |
+
return final_state
|
src/agent/offline_runner.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
from typing import List, Dict, Any, Optional
|
| 3 |
+
from pydantic import ValidationError
|
| 4 |
+
from src.agent.schemas import ExtractedData
|
| 5 |
+
|
| 6 |
+
def run_offline_agent(raw_text: str, candidate_json_outputs: List[str], max_attempts: int = 3):
|
| 7 |
+
"""
|
| 8 |
+
Offline testing: instead of calling OpenAI, we feed the agent JSON outputs
|
| 9 |
+
and see if validation passes, and how many attempts it takes.
|
| 10 |
+
"""
|
| 11 |
+
log = []
|
| 12 |
+
last = ""
|
| 13 |
+
attempt = 1
|
| 14 |
+
|
| 15 |
+
for out in candidate_json_outputs[:max_attempts]:
|
| 16 |
+
last = out
|
| 17 |
+
log.append({"step": "extract", "attempt": attempt, "output": out})
|
| 18 |
+
|
| 19 |
+
try:
|
| 20 |
+
data = ExtractedData.model_validate_json(out)
|
| 21 |
+
log.append({"step": "validate", "attempt": attempt, "status": "pass"})
|
| 22 |
+
return {"result": data.model_dump(), "log": log, "last_json_text": last}
|
| 23 |
+
except ValidationError as e:
|
| 24 |
+
log.append({"step": "validate", "attempt": attempt, "status": "fail", "error": str(e)})
|
| 25 |
+
attempt += 1
|
| 26 |
+
|
| 27 |
+
return {"result": None, "log": log, "last_json_text": last}
|
src/agent/schemas.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from datetime import date
|
| 4 |
+
from typing import List, Optional, Literal
|
| 5 |
+
|
| 6 |
+
from pydantic import BaseModel, Field, field_validator
|
| 7 |
+
|
| 8 |
+
Department = Literal[
|
| 9 |
+
"Artificial Intelligence",
|
| 10 |
+
"AI/ML",
|
| 11 |
+
"Machine Learning",
|
| 12 |
+
"Data Science",
|
| 13 |
+
]
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class Employee(BaseModel):
|
| 17 |
+
user_id: int
|
| 18 |
+
name: str = Field(min_length=1)
|
| 19 |
+
age: Optional[int] = Field(default=None, ge=16, le=80)
|
| 20 |
+
email: Optional[str] = None
|
| 21 |
+
salary: Optional[float] = Field(default=None, ge=0)
|
| 22 |
+
join_date: Optional[date] = None
|
| 23 |
+
department: Department
|
| 24 |
+
performance_score: Optional[float] = Field(default=None, ge=0, le=10)
|
| 25 |
+
location: Optional[str] = None
|
| 26 |
+
job_title: Optional[str] = None
|
| 27 |
+
|
| 28 |
+
@field_validator("name")
|
| 29 |
+
@classmethod
|
| 30 |
+
def normalize_name(cls, v: str) -> str:
|
| 31 |
+
v = " ".join(str(v).strip().split())
|
| 32 |
+
return v.title()
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class RejectedRecord(BaseModel):
|
| 36 |
+
raw_record: str = Field(min_length=1)
|
| 37 |
+
reasons: List[str] = Field(min_length=1)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class ExtractedData(BaseModel):
|
| 41 |
+
# Valid, schema-compliant records only
|
| 42 |
+
employees: List[Employee] = Field(default_factory=list)
|
| 43 |
+
|
| 44 |
+
# Records that cannot be made valid WITHOUT guessing required fields
|
| 45 |
+
rejected: List[RejectedRecord] = Field(default_factory=list)
|
src/core/cleaning.py
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
from dataclasses import dataclass
|
| 5 |
+
from typing import Dict, List, Tuple, Optional
|
| 6 |
+
|
| 7 |
+
import pandas as pd
|
| 8 |
+
from dateutil import parser as dtparser
|
| 9 |
+
from rapidfuzz import process, fuzz
|
| 10 |
+
|
| 11 |
+
WORD_NUMS = {
|
| 12 |
+
"zero": 0, "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
|
| 13 |
+
"six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10,
|
| 14 |
+
"eleven": 11, "twelve": 12, "thirteen": 13, "fourteen": 14,
|
| 15 |
+
"fifteen": 15, "sixteen": 16, "seventeen": 17, "eighteen": 18, "nineteen": 19,
|
| 16 |
+
"twenty": 20, "thirty": 30, "forty": 40, "fifty": 50, "sixty": 60
|
| 17 |
+
}
|
| 18 |
+
|
| 19 |
+
DEPT_CANON = {
|
| 20 |
+
"ai": "Artificial Intelligence",
|
| 21 |
+
"artificial intelligence": "Artificial Intelligence",
|
| 22 |
+
"ai/ml": "AI/ML",
|
| 23 |
+
"ml": "Machine Learning",
|
| 24 |
+
"machine learning": "Machine Learning",
|
| 25 |
+
"data science": "Data Science",
|
| 26 |
+
"datascience": "Data Science",
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
LOCATION_CANON = {
|
| 30 |
+
"nyc": "New York",
|
| 31 |
+
"new york": "New York",
|
| 32 |
+
"san francisco": "San Francisco",
|
| 33 |
+
"chicago": "Chicago",
|
| 34 |
+
"seattle": "Seattle",
|
| 35 |
+
"boston": "Boston",
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
EMAIL_RE = re.compile(r"^[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}$")
|
| 39 |
+
|
| 40 |
+
@dataclass
|
| 41 |
+
class CleaningReport:
|
| 42 |
+
rows: int
|
| 43 |
+
fixes: Dict[str, int]
|
| 44 |
+
warnings: List[str]
|
| 45 |
+
|
| 46 |
+
def _clean_name(s: str) -> Optional[str]:
|
| 47 |
+
if s is None or pd.isna(s):
|
| 48 |
+
return None
|
| 49 |
+
s = str(s).strip()
|
| 50 |
+
s = re.sub(r"\s+", " ", s)
|
| 51 |
+
s = " ".join([w.capitalize() for w in s.split()])
|
| 52 |
+
return s if s else None
|
| 53 |
+
|
| 54 |
+
def _parse_word_number(s: str) -> Optional[int]:
|
| 55 |
+
if s is None:
|
| 56 |
+
return None
|
| 57 |
+
t = str(s).strip().lower()
|
| 58 |
+
if t == "":
|
| 59 |
+
return None
|
| 60 |
+
if re.fullmatch(r"\d+", t):
|
| 61 |
+
return int(t)
|
| 62 |
+
parts = re.split(r"[\s\-]+", t)
|
| 63 |
+
total = 0
|
| 64 |
+
matched = False
|
| 65 |
+
for p in parts:
|
| 66 |
+
if p in WORD_NUMS:
|
| 67 |
+
total += WORD_NUMS[p]
|
| 68 |
+
matched = True
|
| 69 |
+
else:
|
| 70 |
+
return None
|
| 71 |
+
return total if matched else None
|
| 72 |
+
|
| 73 |
+
def _clean_email(s: str) -> Tuple[str, bool]:
|
| 74 |
+
if s is None:
|
| 75 |
+
return "unknown@unknown.com", True
|
| 76 |
+
t = str(s).strip().lower()
|
| 77 |
+
if t == "":
|
| 78 |
+
return "unknown@unknown.com", True
|
| 79 |
+
t = t.replace(" at ", "@").replace(" dot ", ".")
|
| 80 |
+
t = t.replace("..", ".")
|
| 81 |
+
t = re.sub(r"@{2,}", "@", t)
|
| 82 |
+
if EMAIL_RE.match(t):
|
| 83 |
+
return t, (t != str(s).strip().lower())
|
| 84 |
+
if "@" in t and "." not in t.split("@", 1)[1]:
|
| 85 |
+
return t, False
|
| 86 |
+
return t, False
|
| 87 |
+
|
| 88 |
+
def _clean_salary(s: str) -> Tuple[Optional[float], bool]:
|
| 89 |
+
if s is None:
|
| 90 |
+
return None, False
|
| 91 |
+
t = str(s).strip()
|
| 92 |
+
if t == "" or t.lower() == "nan":
|
| 93 |
+
return None, False
|
| 94 |
+
t2 = re.sub(r"[,$]", "", t)
|
| 95 |
+
t2 = re.sub(r"usd", "", t2, flags=re.I).strip()
|
| 96 |
+
try:
|
| 97 |
+
return float(t2), (t2 != t)
|
| 98 |
+
except ValueError:
|
| 99 |
+
return None, False
|
| 100 |
+
|
| 101 |
+
def _clean_date(s: str) -> Tuple[Optional[str], bool]:
|
| 102 |
+
if s is None:
|
| 103 |
+
return None, False
|
| 104 |
+
t = str(s).strip()
|
| 105 |
+
if t == "" or t.lower() == "nan":
|
| 106 |
+
return None, False
|
| 107 |
+
try:
|
| 108 |
+
dt = dtparser.parse(t, dayfirst=False, fuzzy=True)
|
| 109 |
+
iso = dt.date().isoformat()
|
| 110 |
+
return iso, (iso != t)
|
| 111 |
+
except Exception:
|
| 112 |
+
return None, False
|
| 113 |
+
|
| 114 |
+
def _canon_from_map(value: str, mapping: Dict[str, str], threshold: int = 90) -> Tuple[str, bool]:
|
| 115 |
+
raw = (value or "").strip()
|
| 116 |
+
if raw == "":
|
| 117 |
+
return raw, False
|
| 118 |
+
key = raw.lower()
|
| 119 |
+
if key in mapping:
|
| 120 |
+
canon = mapping[key]
|
| 121 |
+
return canon, canon != raw
|
| 122 |
+
match = process.extractOne(key, mapping.keys(), scorer=fuzz.ratio)
|
| 123 |
+
if match and match[1] >= threshold:
|
| 124 |
+
canon = mapping[match[0]]
|
| 125 |
+
return canon, canon != raw
|
| 126 |
+
return raw, False
|
| 127 |
+
|
| 128 |
+
def clean_dataframe(df: pd.DataFrame) -> Tuple[pd.DataFrame, CleaningReport]:
|
| 129 |
+
out = df.copy()
|
| 130 |
+
fixes: Dict[str, int] = {}
|
| 131 |
+
warnings: List[str] = []
|
| 132 |
+
|
| 133 |
+
out.columns = [c.strip() for c in out.columns]
|
| 134 |
+
col_map = {c.lower(): c for c in out.columns}
|
| 135 |
+
|
| 136 |
+
name_col = col_map.get("name")
|
| 137 |
+
age_col = col_map.get("age")
|
| 138 |
+
email_col = col_map.get("email")
|
| 139 |
+
salary_col = col_map.get("salary")
|
| 140 |
+
join_col = col_map.get("join_date") or col_map.get("join date")
|
| 141 |
+
dept_col = col_map.get("department")
|
| 142 |
+
perf_col = col_map.get("performance_score") or col_map.get("performance score")
|
| 143 |
+
loc_col = col_map.get("location")
|
| 144 |
+
|
| 145 |
+
if name_col:
|
| 146 |
+
before = out[name_col].astype(str).tolist()
|
| 147 |
+
out[name_col] = out[name_col].apply(_clean_name)
|
| 148 |
+
fixes["name_normalized"] = sum(b != a for b, a in zip(before, out[name_col].tolist()))
|
| 149 |
+
|
| 150 |
+
if age_col:
|
| 151 |
+
def clean_age(x):
|
| 152 |
+
if x is None:
|
| 153 |
+
return None
|
| 154 |
+
t = str(x).strip().lower()
|
| 155 |
+
if t == "":
|
| 156 |
+
return None
|
| 157 |
+
n = _parse_word_number(t)
|
| 158 |
+
if n is not None:
|
| 159 |
+
return n
|
| 160 |
+
try:
|
| 161 |
+
return int(float(t))
|
| 162 |
+
except Exception:
|
| 163 |
+
return None
|
| 164 |
+
before = out[age_col].tolist()
|
| 165 |
+
out[age_col] = out[age_col].apply(clean_age)
|
| 166 |
+
fixes["age_parsed"] = sum(b != a for b, a in zip(before, out[age_col].tolist()))
|
| 167 |
+
|
| 168 |
+
if email_col:
|
| 169 |
+
before = out[email_col].tolist()
|
| 170 |
+
new_vals, changed = [], 0
|
| 171 |
+
for v in before:
|
| 172 |
+
cleaned, did = _clean_email(v)
|
| 173 |
+
if did:
|
| 174 |
+
changed += 1
|
| 175 |
+
new_vals.append(cleaned)
|
| 176 |
+
out[email_col] = new_vals
|
| 177 |
+
fixes["email_cleaned"] = changed
|
| 178 |
+
invalid = [e for e in out[email_col].tolist() if not EMAIL_RE.match(e)]
|
| 179 |
+
if invalid:
|
| 180 |
+
warnings.append(f"{len(invalid)} email(s) still look invalid (e.g., '{invalid[0]}').")
|
| 181 |
+
|
| 182 |
+
if salary_col:
|
| 183 |
+
before = out[salary_col].tolist()
|
| 184 |
+
vals, changed = [], 0
|
| 185 |
+
for v in before:
|
| 186 |
+
s2, did = _clean_salary(v)
|
| 187 |
+
if did:
|
| 188 |
+
changed += 1
|
| 189 |
+
vals.append(s2)
|
| 190 |
+
out[salary_col] = vals
|
| 191 |
+
fixes["salary_cleaned"] = changed
|
| 192 |
+
|
| 193 |
+
if join_col:
|
| 194 |
+
before = out[join_col].tolist()
|
| 195 |
+
vals, changed = [], 0
|
| 196 |
+
for v in before:
|
| 197 |
+
d2, did = _clean_date(v)
|
| 198 |
+
if did:
|
| 199 |
+
changed += 1
|
| 200 |
+
vals.append(d2)
|
| 201 |
+
out[join_col] = vals
|
| 202 |
+
fixes["join_date_normalized"] = changed
|
| 203 |
+
|
| 204 |
+
if dept_col:
|
| 205 |
+
before = out[dept_col].astype(str).tolist()
|
| 206 |
+
new, changed = [], 0
|
| 207 |
+
for v in before:
|
| 208 |
+
canon, did = _canon_from_map(v, DEPT_CANON, threshold=90)
|
| 209 |
+
if did:
|
| 210 |
+
changed += 1
|
| 211 |
+
new.append(canon)
|
| 212 |
+
out[dept_col] = new
|
| 213 |
+
fixes["department_standardized"] = changed
|
| 214 |
+
|
| 215 |
+
if loc_col:
|
| 216 |
+
before = out[loc_col].astype(str).tolist()
|
| 217 |
+
new, changed = [], 0
|
| 218 |
+
for v in before:
|
| 219 |
+
canon, did = _canon_from_map(v, LOCATION_CANON, threshold=88)
|
| 220 |
+
if did:
|
| 221 |
+
changed += 1
|
| 222 |
+
new.append(canon)
|
| 223 |
+
out[loc_col] = new
|
| 224 |
+
fixes["location_standardized"] = changed
|
| 225 |
+
|
| 226 |
+
if perf_col:
|
| 227 |
+
before = out[perf_col].tolist()
|
| 228 |
+
def clean_perf(x):
|
| 229 |
+
if x is None:
|
| 230 |
+
return None
|
| 231 |
+
t = str(x).strip().lower()
|
| 232 |
+
if t == "" or t == "nan":
|
| 233 |
+
return None
|
| 234 |
+
n = _parse_word_number(t)
|
| 235 |
+
if n is not None:
|
| 236 |
+
return float(n)
|
| 237 |
+
try:
|
| 238 |
+
return float(t)
|
| 239 |
+
except Exception:
|
| 240 |
+
return None
|
| 241 |
+
out[perf_col] = out[perf_col].apply(clean_perf)
|
| 242 |
+
fixes["performance_parsed"] = sum(b != a for b, a in zip(before, out[perf_col].tolist()))
|
| 243 |
+
|
| 244 |
+
return out, CleaningReport(rows=len(out), fixes=fixes, warnings=warnings)
|
src/core/query.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Any, List
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from pydantic import BaseModel, Field, ValidationError
|
| 6 |
+
import math
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class FilterSpec(BaseModel):
|
| 10 |
+
column: str
|
| 11 |
+
op: str = Field(..., description="one of: eq, neq, contains, in, gte, lte")
|
| 12 |
+
value: Any
|
| 13 |
+
|
| 14 |
+
class QuerySpec(BaseModel):
|
| 15 |
+
select: List[str] = Field(default_factory=list)
|
| 16 |
+
filters: List[FilterSpec] = Field(default_factory=list)
|
| 17 |
+
distinct: bool = True
|
| 18 |
+
limit: int = 50
|
| 19 |
+
|
| 20 |
+
SYSTEM_PROMPT = """You are a data query planner.
|
| 21 |
+
You receive:
|
| 22 |
+
- user_question
|
| 23 |
+
- available_columns
|
| 24 |
+
- sample_values (small)
|
| 25 |
+
Return ONLY valid JSON for QuerySpec with:
|
| 26 |
+
- select: columns needed
|
| 27 |
+
- filters: list of {column, op, value}
|
| 28 |
+
- distinct: true/false
|
| 29 |
+
- limit: integer <= 200
|
| 30 |
+
|
| 31 |
+
Rules:
|
| 32 |
+
- Prefer deterministic, simple filters.
|
| 33 |
+
- If the user asks for a department like "Artificial Intelligence", filter Department equals that exact string.
|
| 34 |
+
- If user says AI department, treat it as Department in ["Artificial Intelligence","AI/ML"] unless user explicitly excludes AI/ML.
|
| 35 |
+
- Do NOT invent columns.
|
| 36 |
+
"""
|
| 37 |
+
def _json_safe(obj):
|
| 38 |
+
"""Recursively convert NaN/inf to None so payload becomes valid JSON."""
|
| 39 |
+
if obj is None:
|
| 40 |
+
return None
|
| 41 |
+
|
| 42 |
+
if isinstance(obj, float):
|
| 43 |
+
if math.isnan(obj) or math.isinf(obj):
|
| 44 |
+
return None
|
| 45 |
+
return obj
|
| 46 |
+
|
| 47 |
+
if isinstance(obj, dict):
|
| 48 |
+
return {k: _json_safe(v) for k, v in obj.items()}
|
| 49 |
+
|
| 50 |
+
if isinstance(obj, list):
|
| 51 |
+
return [_json_safe(v) for v in obj]
|
| 52 |
+
|
| 53 |
+
return obj
|
| 54 |
+
|
| 55 |
+
def _openai_client(api_key: str):
|
| 56 |
+
from openai import OpenAI
|
| 57 |
+
return OpenAI(api_key=api_key)
|
| 58 |
+
|
| 59 |
+
def plan_query_with_llm(user_question: str, df: pd.DataFrame, api_key: str, model: str = "gpt-4.1-mini") -> QuerySpec:
|
| 60 |
+
cols = list(df.columns)
|
| 61 |
+
sample_df = df.head(8).copy()
|
| 62 |
+
sample_df = sample_df.where(pd.notna(sample_df), None)
|
| 63 |
+
sample = _json_safe(sample_df.to_dict(orient="records"))
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
client = _openai_client(api_key)
|
| 68 |
+
import json # add at top of file if not present
|
| 69 |
+
|
| 70 |
+
payload = {
|
| 71 |
+
"user_question": user_question,
|
| 72 |
+
"available_columns": cols,
|
| 73 |
+
"sample_values": sample,
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
resp = client.responses.create(
|
| 77 |
+
model=model,
|
| 78 |
+
input=[
|
| 79 |
+
{"role": "system", "content": SYSTEM_PROMPT},
|
| 80 |
+
{"role": "user", "content": json.dumps(payload)},
|
| 81 |
+
],
|
| 82 |
+
temperature=0,
|
| 83 |
+
max_output_tokens=600,
|
| 84 |
+
)
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
text = resp.output_text
|
| 88 |
+
try:
|
| 89 |
+
spec = QuerySpec.model_validate_json(text)
|
| 90 |
+
spec.limit = max(1, min(int(spec.limit), 200))
|
| 91 |
+
return spec
|
| 92 |
+
except ValidationError as e:
|
| 93 |
+
raise ValueError(f"Could not parse model output as QuerySpec JSON. Raw output:\\n{text}\\n\\nError:\\n{e}")
|
| 94 |
+
|
| 95 |
+
def execute_query(spec: QuerySpec, df: pd.DataFrame) -> pd.DataFrame:
|
| 96 |
+
out = df.copy()
|
| 97 |
+
|
| 98 |
+
for f in spec.filters:
|
| 99 |
+
col = f.column
|
| 100 |
+
if col not in out.columns:
|
| 101 |
+
continue
|
| 102 |
+
op = f.op
|
| 103 |
+
val = f.value
|
| 104 |
+
|
| 105 |
+
if op == "eq":
|
| 106 |
+
if val is None:
|
| 107 |
+
out = out[out[col].isna()]
|
| 108 |
+
else:
|
| 109 |
+
out = out[out[col] == val]
|
| 110 |
+
elif op == "neq":
|
| 111 |
+
if val is None:
|
| 112 |
+
out = out[out[col].notna()]
|
| 113 |
+
else:
|
| 114 |
+
out = out[out[col] != val]
|
| 115 |
+
elif op == "contains":
|
| 116 |
+
out = out[out[col].astype(str).str.contains(str(val), case=False, na=False)]
|
| 117 |
+
elif op == "in":
|
| 118 |
+
if not isinstance(val, list):
|
| 119 |
+
val = [val]
|
| 120 |
+
out = out[out[col].isin(val)]
|
| 121 |
+
elif op == "gte":
|
| 122 |
+
out = out[pd.to_numeric(out[col], errors="coerce") >= float(val)]
|
| 123 |
+
elif op == "lte":
|
| 124 |
+
out = out[pd.to_numeric(out[col], errors="coerce") <= float(val)]
|
| 125 |
+
|
| 126 |
+
if spec.select:
|
| 127 |
+
safe_select = [c for c in spec.select if c in out.columns]
|
| 128 |
+
out = out[safe_select]
|
| 129 |
+
|
| 130 |
+
if spec.distinct:
|
| 131 |
+
out = out.drop_duplicates()
|
| 132 |
+
|
| 133 |
+
return out.head(spec.limit)
|
| 134 |
+
|
| 135 |
+
def summarize_results_with_llm(user_question: str, result_df: pd.DataFrame, api_key: str, model: str = "gpt-4.1-mini") -> str:
|
| 136 |
+
client = _openai_client(api_key)
|
| 137 |
+
safe_df = result_df.copy().where(pd.notna(result_df), None)
|
| 138 |
+
preview = _json_safe(safe_df.to_dict(orient="records"))
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
import json # add at top if not present
|
| 142 |
+
|
| 143 |
+
payload = {"question": user_question, "results": preview}
|
| 144 |
+
|
| 145 |
+
resp = client.responses.create(
|
| 146 |
+
model=model,
|
| 147 |
+
input=[
|
| 148 |
+
{"role": "system", "content": "You are a helpful analyst. Summarize results concisely and accurately."},
|
| 149 |
+
{"role": "user", "content": json.dumps(payload)},
|
| 150 |
+
],
|
| 151 |
+
temperature=0.2,
|
| 152 |
+
max_output_tokens=500,
|
| 153 |
+
)
|
| 154 |
+
|
| 155 |
+
return resp.output_text
|
src/core/security.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
from typing import Tuple
|
| 5 |
+
|
| 6 |
+
INJECTION_PATTERNS = [
|
| 7 |
+
r"ignore (all|any) previous",
|
| 8 |
+
r"system prompt",
|
| 9 |
+
r"reveal.*(key|secret|token)",
|
| 10 |
+
r"exfiltrat",
|
| 11 |
+
r"prompt injection",
|
| 12 |
+
]
|
| 13 |
+
|
| 14 |
+
def basic_injection_check(user_text: str) -> Tuple[bool, str]:
|
| 15 |
+
t = (user_text or "").lower()
|
| 16 |
+
for pat in INJECTION_PATTERNS:
|
| 17 |
+
if re.search(pat, t):
|
| 18 |
+
return True, "That request looks like a prompt-injection attempt. I can only answer questions about the uploaded dataset."
|
| 19 |
+
return False, ""
|
src/eval/benchmarks.json
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"dataset_expectations": {
|
| 3 |
+
"notes": "These benchmarks assume a cleaned dataset with standardized Department values."
|
| 4 |
+
},
|
| 5 |
+
"cases": [
|
| 6 |
+
{
|
| 7 |
+
"id": "ai_dept_names_including_aiml",
|
| 8 |
+
"question": "Can you tell me the name of users working in Artificial Intelligence department from all the data entries?",
|
| 9 |
+
"mode": "spec",
|
| 10 |
+
"spec": {
|
| 11 |
+
"select": [
|
| 12 |
+
"Name"
|
| 13 |
+
],
|
| 14 |
+
"filters": [
|
| 15 |
+
{
|
| 16 |
+
"column": "Department",
|
| 17 |
+
"op": "in",
|
| 18 |
+
"value": [
|
| 19 |
+
"Artificial Intelligence",
|
| 20 |
+
"AI/ML"
|
| 21 |
+
]
|
| 22 |
+
}
|
| 23 |
+
],
|
| 24 |
+
"distinct": true,
|
| 25 |
+
"limit": 200
|
| 26 |
+
},
|
| 27 |
+
"expected": {
|
| 28 |
+
"type": "set_equals",
|
| 29 |
+
"column": "Name",
|
| 30 |
+
"values": [
|
| 31 |
+
"Sarah Johnson",
|
| 32 |
+
"Emily Davis",
|
| 33 |
+
"Robert Brown",
|
| 34 |
+
"John Martinez",
|
| 35 |
+
"Amanda White",
|
| 36 |
+
"Jessica Moore",
|
| 37 |
+
"Patricia Thomas",
|
| 38 |
+
"Michelle Clark",
|
| 39 |
+
"Mark Walker",
|
| 40 |
+
"Thomas Allen",
|
| 41 |
+
"Karen Young",
|
| 42 |
+
"Donna Wright",
|
| 43 |
+
"Nancy Hill",
|
| 44 |
+
"Mark Green",
|
| 45 |
+
"Sandra Adams",
|
| 46 |
+
"Brian Nelson",
|
| 47 |
+
"Susan Carter",
|
| 48 |
+
"Margaret Perez",
|
| 49 |
+
"Frank Phillips",
|
| 50 |
+
"Matthew Parker",
|
| 51 |
+
"Kenneth Collins",
|
| 52 |
+
"Andrew Sanchez"
|
| 53 |
+
]
|
| 54 |
+
}
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"id": "high_performers_ge_9",
|
| 58 |
+
"question": "Show employees with performance score >= 9",
|
| 59 |
+
"mode": "spec",
|
| 60 |
+
"spec": {
|
| 61 |
+
"select": [
|
| 62 |
+
"Name",
|
| 63 |
+
"Department",
|
| 64 |
+
"Performance_Score"
|
| 65 |
+
],
|
| 66 |
+
"filters": [
|
| 67 |
+
{
|
| 68 |
+
"column": "Performance_Score",
|
| 69 |
+
"op": "gte",
|
| 70 |
+
"value": 9
|
| 71 |
+
}
|
| 72 |
+
],
|
| 73 |
+
"distinct": true,
|
| 74 |
+
"limit": 200
|
| 75 |
+
},
|
| 76 |
+
"expected": {
|
| 77 |
+
"type": "row_count_gte",
|
| 78 |
+
"min_rows": 1
|
| 79 |
+
}
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"id": "salary_missing",
|
| 83 |
+
"question": "How many employees have missing salary?",
|
| 84 |
+
"mode": "spec",
|
| 85 |
+
"spec": {
|
| 86 |
+
"select": [
|
| 87 |
+
"Salary"
|
| 88 |
+
],
|
| 89 |
+
"filters": [
|
| 90 |
+
{
|
| 91 |
+
"column": "Salary",
|
| 92 |
+
"op": "eq",
|
| 93 |
+
"value": null
|
| 94 |
+
}
|
| 95 |
+
],
|
| 96 |
+
"distinct": false,
|
| 97 |
+
"limit": 1000
|
| 98 |
+
},
|
| 99 |
+
"expected": {
|
| 100 |
+
"type": "row_count_gte",
|
| 101 |
+
"min_rows": 1
|
| 102 |
+
}
|
| 103 |
+
}
|
| 104 |
+
]
|
| 105 |
+
}
|
src/eval/run_agent_suite.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Run the agent on all files in test_inputs/ and print a clear summary.
|
| 3 |
+
|
| 4 |
+
Usage:
|
| 5 |
+
export OPENAI_API_KEY="sk-..."
|
| 6 |
+
python -m src.eval.run_agent_suite
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
import json
|
| 11 |
+
import os
|
| 12 |
+
from statistics import mean
|
| 13 |
+
|
| 14 |
+
from src.agent.graph import run_agent
|
| 15 |
+
|
| 16 |
+
TEST_DIR = Path("test_inputs")
|
| 17 |
+
MODEL = "gpt-4.1-mini"
|
| 18 |
+
MAX_ATTEMPTS = 4
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def main():
|
| 22 |
+
api_key = os.environ.get("OPENAI_API_KEY", "").strip()
|
| 23 |
+
if not api_key:
|
| 24 |
+
raise SystemExit(
|
| 25 |
+
"OPENAI_API_KEY is not set. Export your key before running the suite:\n\n"
|
| 26 |
+
"export OPENAI_API_KEY=\"sk-...\""
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
inputs = sorted(TEST_DIR.glob("*.txt"))
|
| 30 |
+
if not inputs:
|
| 31 |
+
raise SystemExit("No test_inputs/*.txt files found. Create the sample inputs first.")
|
| 32 |
+
|
| 33 |
+
total = 0
|
| 34 |
+
passed_count = 0
|
| 35 |
+
correct_handling = 0
|
| 36 |
+
attempts_hist = []
|
| 37 |
+
attempts_hist_for_passed = []
|
| 38 |
+
|
| 39 |
+
# OPTIONAL: define expected behavior per file (True=expected PASS/valid handling)
|
| 40 |
+
# For correctness metric we will consider "handled correctly" as either:
|
| 41 |
+
# - produced employees & they match expectations (not available here), or
|
| 42 |
+
# - produced 0 employees and non-empty rejected (for cases that should be rejected)
|
| 43 |
+
# You can expand expected_outcomes if you want to mark specific cases as expected_fail, etc.
|
| 44 |
+
expected_outcomes = {
|
| 45 |
+
# "case09_no_ids.txt": "reject", # example: expected to reject (no user_id)
|
| 46 |
+
# "case15_extreme_noise.txt": "reject",
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
for p in inputs:
|
| 50 |
+
total += 1
|
| 51 |
+
raw = p.read_text()
|
| 52 |
+
print(f"\n=== Running: {p.name} ===")
|
| 53 |
+
try:
|
| 54 |
+
final = run_agent(raw, api_key=api_key, model=MODEL, max_attempts=MAX_ATTEMPTS)
|
| 55 |
+
except Exception as e:
|
| 56 |
+
print(f"[ERROR] agent crashed for {p.name}: {e}")
|
| 57 |
+
continue
|
| 58 |
+
|
| 59 |
+
result = final.get("result")
|
| 60 |
+
log = final.get("log", [])
|
| 61 |
+
|
| 62 |
+
# attempts used is the max attempt number seen in log, fall back to 0
|
| 63 |
+
attempts_used = max((entry.get("attempt", 0) for entry in log), default=0)
|
| 64 |
+
|
| 65 |
+
employees_n = 0
|
| 66 |
+
rejected_n = 0
|
| 67 |
+
if result:
|
| 68 |
+
employees_n = len(result.get("employees", []))
|
| 69 |
+
rejected_n = len(result.get("rejected", []))
|
| 70 |
+
|
| 71 |
+
# Define pass = result is not None (valid schema produced)
|
| 72 |
+
passed = result is not None
|
| 73 |
+
|
| 74 |
+
# Print a concise line
|
| 75 |
+
print(f"{p.name}: {'PASS' if passed else 'FAIL'} | attempts={attempts_used} | employees={employees_n} | rejected={rejected_n}")
|
| 76 |
+
|
| 77 |
+
# Print extra info for failures or suspicious cases
|
| 78 |
+
if not passed:
|
| 79 |
+
print("-> Agent failed to produce schema-valid JSON within retry limit.")
|
| 80 |
+
print("Last JSON attempt:")
|
| 81 |
+
print(final.get("last_json_text", ""))
|
| 82 |
+
else:
|
| 83 |
+
# Optionally print the JSON for inspection of suspicious cases
|
| 84 |
+
if employees_n == 0 and rejected_n > 0:
|
| 85 |
+
print("-> No valid employees extracted; records were rejected (no hallucination).")
|
| 86 |
+
# You can uncomment to always show the JSON
|
| 87 |
+
# print(json.dumps(result, indent=2))
|
| 88 |
+
|
| 89 |
+
# Evaluate "correct handling" heuristically:
|
| 90 |
+
# if expected_outcomes says "reject" and the agent indeed rejected (employees==0 and rejected>0) -> correct
|
| 91 |
+
expected = expected_outcomes.get(p.name)
|
| 92 |
+
handled_correctly = False
|
| 93 |
+
if expected == "reject":
|
| 94 |
+
handled_correctly = (employees_n == 0 and rejected_n > 0)
|
| 95 |
+
else:
|
| 96 |
+
# default heuristic: producing a schema result is considered handling (but inspect counts)
|
| 97 |
+
handled_correctly = passed
|
| 98 |
+
|
| 99 |
+
if handled_correctly:
|
| 100 |
+
correct_handling += 1
|
| 101 |
+
|
| 102 |
+
if passed:
|
| 103 |
+
passed_count += 1
|
| 104 |
+
attempts_hist_for_passed.append(attempts_used)
|
| 105 |
+
attempts_hist.append(attempts_used)
|
| 106 |
+
|
| 107 |
+
# Summary
|
| 108 |
+
print("\n=== SUITE SUMMARY ===")
|
| 109 |
+
print(f"Total cases: {total}")
|
| 110 |
+
print(f"Schema-valid produced (pass): {passed_count}/{total} = {passed_count/total:.2%}")
|
| 111 |
+
print(f"Correct-handling (heuristic expected): {correct_handling}/{total} = {correct_handling/total:.2%}")
|
| 112 |
+
if attempts_hist:
|
| 113 |
+
print(f"Avg attempts (all cases): {mean(attempts_hist):.2f}")
|
| 114 |
+
if attempts_hist_for_passed:
|
| 115 |
+
print(f"Avg attempts (passed cases): {mean(attempts_hist_for_passed):.2f}")
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
if __name__ == "__main__":
|
| 119 |
+
main()
|
src/eval/run_eval.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
import os
|
| 6 |
+
from dataclasses import dataclass
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 9 |
+
|
| 10 |
+
import pandas as pd
|
| 11 |
+
|
| 12 |
+
from src.core.cleaning import clean_dataframe
|
| 13 |
+
from src.core.query import QuerySpec, FilterSpec, execute_query, plan_query_with_llm
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
@dataclass
|
| 17 |
+
class CaseResult:
|
| 18 |
+
case_id: str
|
| 19 |
+
passed: bool
|
| 20 |
+
details: str
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def _load_benchmarks(path: Path) -> Dict[str, Any]:
|
| 24 |
+
return json.loads(path.read_text())
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _spec_from_dict(d: Dict[str, Any]) -> QuerySpec:
|
| 28 |
+
filters = [FilterSpec(**f) for f in d.get("filters", [])]
|
| 29 |
+
return QuerySpec(
|
| 30 |
+
select=d.get("select", []),
|
| 31 |
+
filters=filters,
|
| 32 |
+
distinct=bool(d.get("distinct", True)),
|
| 33 |
+
limit=int(d.get("limit", 50)),
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def _check_expected(result_df: pd.DataFrame, expected: Dict[str, Any]) -> Tuple[bool, str]:
|
| 38 |
+
et = expected.get("type")
|
| 39 |
+
if et == "set_equals":
|
| 40 |
+
col = expected["column"]
|
| 41 |
+
want = set(expected["values"])
|
| 42 |
+
if col not in result_df.columns:
|
| 43 |
+
return False, f"Missing expected column '{col}'. Columns: {list(result_df.columns)}"
|
| 44 |
+
got = set([x for x in result_df[col].dropna().astype(str).tolist()])
|
| 45 |
+
missing = want - got
|
| 46 |
+
extra = got - want
|
| 47 |
+
if missing or extra:
|
| 48 |
+
return False, f"Set mismatch. Missing={sorted(missing)} Extra={sorted(extra)}"
|
| 49 |
+
return True, "OK"
|
| 50 |
+
if et == "row_count_gte":
|
| 51 |
+
min_rows = int(expected["min_rows"])
|
| 52 |
+
n = len(result_df)
|
| 53 |
+
return (n >= min_rows), f"Rows={n}, expected >= {min_rows}"
|
| 54 |
+
if et == "row_count_equals":
|
| 55 |
+
want = int(expected["rows"])
|
| 56 |
+
n = len(result_df)
|
| 57 |
+
return (n == want), f"Rows={n}, expected == {want}"
|
| 58 |
+
return False, f"Unknown expected.type '{et}'"
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def run(args: argparse.Namespace) -> int:
|
| 62 |
+
bench = _load_benchmarks(Path(args.benchmarks))
|
| 63 |
+
df_raw = pd.read_csv(args.csv)
|
| 64 |
+
df, report = clean_dataframe(df_raw)
|
| 65 |
+
|
| 66 |
+
results: List[CaseResult] = []
|
| 67 |
+
|
| 68 |
+
for case in bench["cases"]:
|
| 69 |
+
cid = case["id"]
|
| 70 |
+
mode = case.get("mode", "spec")
|
| 71 |
+
expected = case["expected"]
|
| 72 |
+
|
| 73 |
+
try:
|
| 74 |
+
if mode == "spec":
|
| 75 |
+
spec = _spec_from_dict(case["spec"])
|
| 76 |
+
elif mode == "llm":
|
| 77 |
+
if not args.api_key and not os.getenv("OPENAI_API_KEY"):
|
| 78 |
+
results.append(CaseResult(cid, False, "No API key for LLM mode"))
|
| 79 |
+
continue
|
| 80 |
+
api_key = args.api_key or os.getenv("OPENAI_API_KEY", "")
|
| 81 |
+
spec = plan_query_with_llm(case["question"], df, api_key=api_key, model=args.model)
|
| 82 |
+
else:
|
| 83 |
+
results.append(CaseResult(cid, False, f"Unknown mode '{mode}'"))
|
| 84 |
+
continue
|
| 85 |
+
|
| 86 |
+
out = execute_query(spec, df)
|
| 87 |
+
ok, details = _check_expected(out, expected)
|
| 88 |
+
results.append(CaseResult(cid, ok, details))
|
| 89 |
+
except Exception as e:
|
| 90 |
+
results.append(CaseResult(cid, False, f"Exception: {e}"))
|
| 91 |
+
|
| 92 |
+
passed = sum(1 for r in results if r.passed)
|
| 93 |
+
total = len(results)
|
| 94 |
+
|
| 95 |
+
print("\n=== Cleaning report ===")
|
| 96 |
+
print({"rows": report.rows, "fixes": report.fixes, "warnings": report.warnings})
|
| 97 |
+
|
| 98 |
+
print("\n=== Benchmark results ===")
|
| 99 |
+
for r in results:
|
| 100 |
+
status = "PASS" if r.passed else "FAIL"
|
| 101 |
+
print(f"[{status}] {r.case_id}: {r.details}")
|
| 102 |
+
|
| 103 |
+
print(f"\nSummary: {passed}/{total} passed")
|
| 104 |
+
return 0 if passed == total else 1
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
if __name__ == "__main__":
|
| 108 |
+
p = argparse.ArgumentParser(description="Run benchmark evaluation for the AI Data Validation Agent")
|
| 109 |
+
p.add_argument("--csv", required=True, help="Path to CSV dataset")
|
| 110 |
+
p.add_argument("--benchmarks", default="src/eval/benchmarks.json", help="Path to benchmarks.json")
|
| 111 |
+
p.add_argument("--api-key", default="", help="OpenAI API key (optional; only needed for llm-mode cases)")
|
| 112 |
+
p.add_argument("--model", default="gpt-4.1-mini", help="Model for llm-mode cases")
|
| 113 |
+
raise SystemExit(run(p.parse_args()))
|
src/tests/test_query.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from src.core.query import QuerySpec, FilterSpec, execute_query
|
| 3 |
+
|
| 4 |
+
def test_execute_query_eq():
|
| 5 |
+
df = pd.DataFrame([
|
| 6 |
+
{"Name":"A", "Department":"Artificial Intelligence"},
|
| 7 |
+
{"Name":"B", "Department":"Data Science"},
|
| 8 |
+
])
|
| 9 |
+
spec = QuerySpec(select=["Name"], filters=[FilterSpec(column="Department", op="eq", value="Artificial Intelligence")])
|
| 10 |
+
out = execute_query(spec, df)
|
| 11 |
+
assert out["Name"].tolist() == ["A"]
|
src/tests_agent/test_offline_runner.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from src.agent.offline_runner import run_offline_agent
|
| 3 |
+
|
| 4 |
+
def test_offline_runner_fails_then_passes():
|
| 5 |
+
bad = json.dumps({
|
| 6 |
+
"employees": [{
|
| 7 |
+
"user_id": "101", # wrong type
|
| 8 |
+
"name": "", # invalid (min length)
|
| 9 |
+
"department": "ai" # invalid enum
|
| 10 |
+
}]
|
| 11 |
+
})
|
| 12 |
+
|
| 13 |
+
good = json.dumps({
|
| 14 |
+
"employees": [{
|
| 15 |
+
"user_id": 101,
|
| 16 |
+
"name": "Michael Chen",
|
| 17 |
+
"age": 29,
|
| 18 |
+
"email": None,
|
| 19 |
+
"salary": 120000,
|
| 20 |
+
"join_date": None,
|
| 21 |
+
"department": "Artificial Intelligence",
|
| 22 |
+
"performance_score": None,
|
| 23 |
+
"location": None,
|
| 24 |
+
"job_title": None
|
| 25 |
+
}]
|
| 26 |
+
})
|
| 27 |
+
|
| 28 |
+
out = run_offline_agent("raw", [bad, good], max_attempts=2)
|
| 29 |
+
assert out["result"] is not None
|
| 30 |
+
assert len(out["log"]) >= 3
|
src/tests_agent/tests_schema.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.agent.schemas import ExtractedData
|
| 2 |
+
|
| 3 |
+
def test_schema_accepts_valid_employee():
|
| 4 |
+
payload = {
|
| 5 |
+
"employees": [
|
| 6 |
+
{
|
| 7 |
+
"user_id": 101,
|
| 8 |
+
"name": "Michael Chen",
|
| 9 |
+
"age": 29,
|
| 10 |
+
"email": None,
|
| 11 |
+
"salary": 120000,
|
| 12 |
+
"join_date": "2024-01-01",
|
| 13 |
+
"department": "Artificial Intelligence",
|
| 14 |
+
"performance_score": 9.5,
|
| 15 |
+
"location": "Chicago",
|
| 16 |
+
"job_title": "Engineer",
|
| 17 |
+
}
|
| 18 |
+
]
|
| 19 |
+
}
|
| 20 |
+
|
| 21 |
+
data = ExtractedData.model_validate(payload)
|
| 22 |
+
assert data.employees[0].user_id == 101
|
test_inputs/case01_simple.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ID: 101
|
| 2 |
+
Name: michael chen
|
| 3 |
+
Age: twenty nine
|
| 4 |
+
Dept: ai
|
| 5 |
+
Salary: $120,000
|
| 6 |
+
Performance: 11
|
| 7 |
+
Email: michael at gmail.com
|
test_inputs/case02_word_numbers.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ID: 102
|
| 2 |
+
Name: maria lopez
|
| 3 |
+
Age: thirty two
|
| 4 |
+
Dept: ai
|
| 5 |
+
Salary: $120,000
|
| 6 |
+
Performance: nine
|
| 7 |
+
Email: maria at gmail.com
|
test_inputs/case03_missing_fields.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ID: 103
|
| 2 |
+
Name: kevin brown
|
| 3 |
+
Dept: Data Science
|
| 4 |
+
Salary: 88000
|
test_inputs/case04_bad_performance.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ID: 104
|
| 2 |
+
Name: Alice Green
|
| 3 |
+
Age: 27
|
| 4 |
+
Dept: ML
|
| 5 |
+
Salary: 100000
|
| 6 |
+
Performance: 15
|
| 7 |
+
Email: alice@company.com
|
test_inputs/case05_weird_date.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ID: 105
|
| 2 |
+
Name: David Chen
|
| 3 |
+
Age: 29
|
| 4 |
+
Dept: AI/ML
|
| 5 |
+
Salary: 99000
|
| 6 |
+
Join: 14th Feb 2024
|
| 7 |
+
Performance: 7.8
|
test_inputs/case06_random_noise.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
### Employee Record ###
|
| 2 |
+
blah blah
|
| 3 |
+
ID => 106
|
| 4 |
+
Employee Name: robert taylor
|
| 5 |
+
AGE: 31
|
| 6 |
+
Department: machine learning
|
| 7 |
+
salary approx $110,000 per year
|
| 8 |
+
performance rating 8
|
| 9 |
+
END RECORD
|
test_inputs/case07_duplicate_fields.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ID: 107
|
| 2 |
+
Name: Emma Watson
|
| 3 |
+
Name: emma watson
|
| 4 |
+
Age: 26
|
| 5 |
+
Dept: DataScience
|
| 6 |
+
Salary: $87000
|
| 7 |
+
Performance: 8.2
|
test_inputs/case08_two_records.txt
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Employee 1:
|
| 2 |
+
ID: 201
|
| 3 |
+
Name: michael scott
|
| 4 |
+
Age: forty five
|
| 5 |
+
Dept: ai
|
| 6 |
+
Salary: 150000
|
| 7 |
+
Performance: 9
|
| 8 |
+
|
| 9 |
+
Employee 2:
|
| 10 |
+
ID: 202
|
| 11 |
+
Name: jim halpert
|
| 12 |
+
Age: 30
|
| 13 |
+
Department: sales
|
| 14 |
+
Salary: 85000
|
| 15 |
+
Performance: 7
|
test_inputs/case09_no_ids.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Name: Sarah Connor
|
| 2 |
+
Age: 33
|
| 3 |
+
Dept: Artificial Intelligence
|
| 4 |
+
Salary: 115000
|
| 5 |
+
Performance: 8.7
|
test_inputs/case10_conflicting_values.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ID: 301
|
| 2 |
+
Name: Peter Parker
|
| 3 |
+
Age: 25
|
| 4 |
+
Age: 40
|
| 5 |
+
Dept: AI
|
| 6 |
+
Salary: 70000
|
| 7 |
+
Performance: 6
|
test_inputs/case11_unstructured_paragraph.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Michael Jordan joined the company recently. His ID is 401.
|
| 2 |
+
He works in AI. He earns around $130,000 annually.
|
| 3 |
+
Performance is 9.2. He is 35 years old.
|
test_inputs/case12_partial_sentences.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
402, linda jones, thirty, data science, 92000, performance 8
|
test_inputs/case13_garbage_mixed.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
%%%%%%%
|
| 2 |
+
Random text here.
|
| 3 |
+
Employee start.
|
| 4 |
+
ID: 501
|
| 5 |
+
Name: Tony Stark
|
| 6 |
+
Age: genius
|
| 7 |
+
Dept: AI
|
| 8 |
+
Salary: 300000
|
| 9 |
+
Performance: 12
|
| 10 |
+
Email: tony@starkindustries
|
| 11 |
+
END
|
test_inputs/case14_large_block.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ID: 601 Name: A Age: 25 Dept: AI Salary: 90000 Performance: 7
|
| 2 |
+
ID: 602 Name: B Age: 26 Dept: AI Salary: 91000 Performance: 8
|
| 3 |
+
ID: 603 Name: C Age: 27 Dept: AI Salary: 92000 Performance: 9
|
| 4 |
+
ID: 604 Name: D Age: 28 Dept: AI Salary: 93000 Performance: 6
|
| 5 |
+
ID: 605 Name: E Age: 29 Dept: AI Salary: 94000 Performance: 5
|
test_inputs/case15_extreme_noise.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
I have an employee somewhere. ID maybe 701?
|
| 2 |
+
He is probably 28-ish. Works in something like AI.
|
| 3 |
+
Salary could be 100k?? Not sure.
|
| 4 |
+
Performance: maybe 8 or 9.
|