Siddhesh Patil commited on
Commit
b67668b
·
0 Parent(s):

Initial commit - Self-Correcting Data Validation Agent

Browse files
.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ .venv/
2
+ __pycache__/
3
+ .env
4
+ .env.*
5
+ *.pyc
6
+ .DS_Store
README.md ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AI Data Validation Agent (Industry-ready)
2
+
3
+ A Streamlit app that:
4
+ - Cleans & normalizes messy HR-like tabular data (emails, dates, ages, salaries, departments)
5
+ - Produces an **audit report** of what was fixed
6
+ - Answers natural-language questions **deterministically** using pandas (not "LLM guesses")
7
+ - Uses an LLM only to convert the user's question into a small JSON query spec + to explain results
8
+
9
+ ## Run locally
10
+
11
+ ```bash
12
+ python -m venv .venv && source .venv/bin/activate
13
+ pip install -r requirements.txt
14
+ streamlit run app.py
15
+ ```
16
+
17
+ ## Environment variables
18
+
19
+ Set your OpenAI key in **Streamlit sidebar** or via env var:
20
+
21
+ ```bash
22
+ export OPENAI_API_KEY="..."
23
+ ```
24
+
25
+
26
+ ## Evaluation (accuracy benchmarks)
27
+
28
+ Run deterministic benchmarks (no LLM required):
29
+
30
+ ```bash
31
+ python -m venv .venv && source .venv/bin/activate
32
+ pip install -r requirements.txt
33
+ python -m src.eval.run_eval --csv your_data.csv
34
+ ```
35
+
36
+ Optional end-to-end LLM evaluation:
37
+ - Set `OPENAI_API_KEY`, and change a benchmark case `mode` to `"llm"` in `src/eval/benchmarks.json`.
38
+ - Then re-run the command above.
app.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import streamlit as st
4
+ import pandas as pd
5
+ from src.agent.graph import run_agent
6
+ from src.core.cleaning import clean_dataframe
7
+ from src.core.query import plan_query_with_llm, execute_query, summarize_results_with_llm
8
+ from src.core.security import basic_injection_check
9
+
10
+ st.set_page_config(page_title="AI Data Validation Agent (Industry-ready)", page_icon="🤖", layout="wide")
11
+
12
+ st.title("🤖 AI Data Validation Agent (Industry-ready)")
13
+ st.caption("Deterministic pandas answers + LLM planning/summarization (no LLM guessing).")
14
+
15
+ with st.sidebar:
16
+ st.header("Configuration")
17
+ api_key = st.text_input("OpenAI API Key", value=os.getenv("OPENAI_API_KEY",""), type="password")
18
+ model = st.selectbox("Model", ["gpt-4.1-mini","gpt-4.1","gpt-4o-mini","gpt-4o"], index=0)
19
+ st.divider()
20
+ uploaded = st.file_uploader("Upload CSV", type=["csv"])
21
+ st.divider()
22
+ st.markdown("**Tip:** Clean the data first, then ask questions.")
23
+
24
+ if "df_raw" not in st.session_state:
25
+ st.session_state.df_raw = None
26
+ if "df_clean" not in st.session_state:
27
+ st.session_state.df_clean = None
28
+ if "clean_report" not in st.session_state:
29
+ st.session_state.clean_report = None
30
+
31
+ tab1, tab2, tab3 = st.tabs(["📄 Data cleaning", "💬 Ask questions", "🧠 Self-correcting agent"])
32
+
33
+
34
+ with tab1:
35
+ st.subheader("1) Upload & clean")
36
+ if uploaded is None:
37
+ st.info("Upload a CSV from the sidebar.")
38
+ else:
39
+ df = pd.read_csv(uploaded)
40
+ st.session_state.df_raw = df
41
+ st.write("Raw preview")
42
+ st.dataframe(df.head(20), use_container_width=True)
43
+
44
+ if st.button("Clean & normalize", type="primary"):
45
+ dfc, report = clean_dataframe(df)
46
+ st.session_state.df_clean = dfc
47
+ st.session_state.clean_report = report
48
+
49
+ if st.session_state.df_clean is not None:
50
+ st.success("Cleaned dataset ready ✅")
51
+ report = st.session_state.clean_report
52
+ st.write("Cleaning report")
53
+ st.json({"rows": report.rows, "fixes": report.fixes, "warnings": report.warnings})
54
+ st.write("Clean preview")
55
+ st.dataframe(st.session_state.df_clean, use_container_width=True)
56
+
57
+ csv_bytes = st.session_state.df_clean.to_csv(index=False).encode("utf-8")
58
+ st.download_button("Download cleaned CSV", data=csv_bytes, file_name="cleaned.csv", mime="text/csv")
59
+
60
+ with tab2:
61
+ st.subheader("2) Ask deterministic questions")
62
+ if st.session_state.df_clean is None:
63
+ st.warning("Clean your dataset first (Data cleaning tab).")
64
+ else:
65
+ question = st.text_input("Ask a question about the dataset", placeholder='e.g., "Names of users in Artificial Intelligence department"')
66
+ colA, colB = st.columns([1,1])
67
+ with colA:
68
+ run = st.button("Run query", type="primary")
69
+ with colB:
70
+ show_plan = st.checkbox("Show query plan (JSON)", value=False)
71
+
72
+ if run:
73
+ if not api_key:
74
+ st.error("Please add your OpenAI API key in the sidebar.")
75
+ elif not question.strip():
76
+ st.error("Please type a question.")
77
+ else:
78
+ blocked, msg = basic_injection_check(question)
79
+ if blocked:
80
+ st.error(msg)
81
+ else:
82
+ try:
83
+ spec = plan_query_with_llm(question, st.session_state.df_clean, api_key=api_key, model=model)
84
+ if show_plan:
85
+ st.code(spec.model_dump_json(indent=2), language="json")
86
+
87
+ result = execute_query(spec, st.session_state.df_clean)
88
+
89
+ st.write("Result table")
90
+ st.dataframe(result, use_container_width=True)
91
+
92
+ answer = summarize_results_with_llm(question, result, api_key=api_key, model=model)
93
+ st.markdown("### Answer")
94
+ st.write(answer)
95
+
96
+ except Exception as e:
97
+ st.error(str(e))
98
+
99
+ st.divider()
100
+ st.markdown("### Why this is accurate")
101
+ st.markdown("- LLM only creates a small JSON query plan.\n- Pandas executes it deterministically.\n- LLM only summarizes already computed results.")
102
+ from src.agent.graph import run_agent
103
+ import pandas as pd
104
+
105
+ with tab3:
106
+ st.subheader("Self-Correcting Data Validation Agent")
107
+ st.caption("Paste messy data → Extract JSON → Validate → Auto-correct retries → Final schema-perfect output (NO hallucination)")
108
+
109
+ raw = st.text_area("Paste messy employee data (any format)", height=220)
110
+ max_attempts = st.slider("Max retries", 1, 6, 3)
111
+
112
+ # --- 1) VISUALIZE STATE MACHINE (diagram) ---
113
+ st.markdown("### 🧭 State Machine (Extract → Validate → Correct → Finalize)")
114
+
115
+ dot = """
116
+ digraph G {
117
+ rankdir=LR;
118
+ node [shape=box, style="rounded,filled", color="#444444", fillcolor="#F4F6F8"];
119
+
120
+ Extract [label="extract"];
121
+ Validate [label="validate"];
122
+ Correct [label="correct"];
123
+ Finalize [label="finalize"];
124
+
125
+ Extract -> Validate;
126
+ Validate -> Finalize [label="pass OR max_retries"];
127
+ Validate -> Correct [label="fail AND retries_left"];
128
+ Correct -> Validate;
129
+ }
130
+ """
131
+ try:
132
+ st.graphviz_chart(dot, use_container_width=True)
133
+ except Exception:
134
+ st.code(
135
+ "extract → validate → (pass) finalize\n"
136
+ " ↘ (fail) correct → validate (loop)\n",
137
+ language="text"
138
+ )
139
+
140
+ if st.button("Run Agent", type="primary"):
141
+ if not api_key:
142
+ st.error("Please add your OpenAI API key in the sidebar.")
143
+ st.stop()
144
+ if not raw.strip():
145
+ st.error("Paste some messy data first.")
146
+ st.stop()
147
+
148
+ with st.spinner("Running extract → validate → correct loop..."):
149
+ final_state = run_agent(raw, api_key=api_key, model=model, max_attempts=max_attempts)
150
+
151
+ log = final_state.get("log", [])
152
+ result = final_state.get("result")
153
+
154
+ # --- 3) CORRECTION COUNT SUMMARY (metrics) ---
155
+ # attempts used
156
+ attempts_used = max((x.get("attempt", 0) for x in log), default=0)
157
+
158
+ # counts
159
+ employees_n = len(result.get("employees", [])) if result else 0
160
+ rejected_n = len(result.get("rejected", [])) if result else 0
161
+
162
+ # how many correct steps happened
163
+ correct_steps = sum(1 for x in log if x.get("step") == "correct")
164
+ validate_fails = sum(1 for x in log if x.get("step") == "validate" and x.get("status") == "fail")
165
+
166
+ st.markdown("### 📊 Run Summary")
167
+ c1, c2, c3, c4 = st.columns(4)
168
+ c1.metric("Attempts used", attempts_used if attempts_used else 1)
169
+ c2.metric("Corrections", correct_steps)
170
+ c3.metric("Valid employees", employees_n)
171
+ c4.metric("Rejected records", rejected_n)
172
+
173
+ # Optional: show pass/fail clearly
174
+ if result is None:
175
+ st.error("Could not produce schema-valid JSON within retry limit.")
176
+ else:
177
+ st.success("Schema-valid output ✅")
178
+
179
+ # --- 2) BEFORE / AFTER COMPARISON ---
180
+ st.markdown("### 🔁 Before vs After")
181
+ left, right = st.columns(2)
182
+
183
+ with left:
184
+ st.markdown("#### Before (Raw Input)")
185
+ st.code(raw.strip(), language="text")
186
+
187
+ with right:
188
+ st.markdown("#### After (Schema Output)")
189
+ if result is None:
190
+ st.code(final_state.get("last_json_text", ""), language="json")
191
+ else:
192
+ st.code(json.dumps(result, indent=2, default=str), language="json")
193
+
194
+ # --- Correction Log (keep your existing) ---
195
+ st.markdown("### 🧾 Correction Log")
196
+ st.json(log)
197
+
198
+ # If failed, stop here
199
+ if result is None:
200
+ st.markdown("### Last JSON Attempt (debug)")
201
+ st.code(final_state.get("last_json_text", ""), language="json")
202
+ st.stop()
203
+
204
+ # -------- Valid employees table --------
205
+ st.markdown("### ✅ Valid Employees")
206
+ employees = result.get("employees", [])
207
+ if employees:
208
+ df_emp = pd.DataFrame(employees)
209
+ st.dataframe(df_emp, use_container_width=True)
210
+ else:
211
+ st.info("No valid employees extracted (all records were rejected).")
212
+
213
+ # -------- Rejected records table --------
214
+ st.markdown("### 🚫 Rejected Records (No hallucination)")
215
+ rejected = result.get("rejected", [])
216
+ if rejected:
217
+ rej_rows = []
218
+ for r in rejected:
219
+ rej_rows.append(
220
+ {
221
+ "raw_record": r.get("raw_record", ""),
222
+ "reasons": "; ".join(r.get("reasons", [])),
223
+ }
224
+ )
225
+ df_rej = pd.DataFrame(rej_rows)
226
+ st.dataframe(df_rej, use_container_width=True)
227
+ else:
228
+ st.info("No rejected records. Everything was schema-valid.")
229
+
230
+ # -------- Download --------
231
+ st.download_button(
232
+ "Download JSON",
233
+ data=json.dumps(result, indent=2, default=str),
234
+ file_name="validated_output.json",
235
+ mime="application/json",
236
+ )
employee_dataset_50rows.csv ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ user_id,Name,Age,Email,Salary,Join_Date,Department,Performance_Score,Location,Job_Title
2
+ 1, Sarah Johnson ,28,sarah.j@techcorp.com,95000,2023-01-15,AI,8.5,New York,Senior Engineer
3
+ 2,michael chen, thirty two ,michael at gmail.com,110000 ,15/02/2023,Data Science,9.2,San Francisco,Lead Analyst
4
+ 3,EMILY DAVIS,,emily@techcorp,87000,2023/03/10,ai,7.8,NYC,Engineer
5
+ 4,James Wilson,twenty nine,james.wilson@company.com,92000,01-04-2023,Machine Learning,8.7,San Francisco,ML Engineer
6
+ 5, Robert Brown ,35,robert@techcorp.com,$125000,2023-05-20,AI , 9.5,New York,Principal Engineer
7
+ 6,Jennifer Lee,27,jennifer@@techcorp.com,NaN,2023-06-01,Data science,8.3,Chicago,Data Scientist
8
+ 7,Sarah Johnson,28,sarah.j@techcorp.com,95000,2023-01-15,AI,8.5,New York,Senior Engineer
9
+ 8,David Kim, 31, ,103000,2023-07-15,ML,nine,Seattle,Senior ML Engineer
10
+ 9,Lisa Anderson,26,lisa.anderson@tech.com,88000,2023-08-01,DataScience,8.9,Boston,Data Engineer
11
+ 10,,thirty,maria@techcorp.com,94000,2023-09-10,AI,8.6,New York,Engineer
12
+ 11,JOHN MARTINEZ, ,john.m@company.com,98000,12th Oct 2023,Artificial Intelligence,8.2,Chicago,Senior Engineer
13
+ 12,Amanda White,29,amanda@techcorp.com, 102000 ,2023-11-01,AI/ML,9.1,San Francisco,Tech Lead
14
+ 13,Chris Taylor,33,chris@@company.com,115000,2023-12-01,data science,8.8,Seattle,Principal Analyst
15
+ 14,Jessica Moore,twenty five,jessica@techcorp.com,85000USD,2024-01-05,AI,7.9,Boston,Junior Engineer
16
+ 15,daniel garcia,30,daniel at company.com,99000,2024-02-01,MachineLearning ,8.4,NYC,ML Engineer
17
+ 16,PATRICIA THOMAS,34,patricia@tech,120000,2024-02-15,AI,9.3,San Francisco,Staff Engineer
18
+ 17,Ryan Harris,28,ryan.h@techcorp.com,96000,10/03/2024,Data Science,8.5,Chicago,Senior Analyst
19
+ 18,Michelle Clark,27,michelle@company.com,$91000,2024-03-20,ai,8.1,New York,Engineer
20
+ 19,Kevin Rodriguez, 29 ,kevin@@techcorp.com,105000,01-04-2024,ML,9.0,Seattle,Senior Engineer
21
+ 20,Laura Lewis,twenty six,laura.lewis@tech.com,87000,2024-04-15,DataScience,7.7,Boston,Data Analyst
22
+ 21, Mark Walker ,32,mark@techcorp.com,NaN,2024-05-01,AI,8.7,San Francisco,Senior Engineer
23
+ 22,Angela Hall,28,angela at gmail.com,93000 ,15/05/2024,Data science,eight,NYC,Data Scientist
24
+ 23,Thomas Allen,35,thomas@company,112000,2024/06/01,Artificial Intelligence,9.4,Chicago,Principal Engineer
25
+ 24,Karen Young,27,karen.y@techcorp.com,89000,05-06-2024,ai,8.3,Seattle,Engineer
26
+ 25,Steven King,thirty one,steven@@tech.com,107000,2024-07-10,Machine Learning,8.9,Boston,ML Engineer
27
+ 26,DONNA WRIGHT,29,donna@techcorp.com,$98000,2024-07-20,AI/ML,8.6,San Francisco,Tech Lead
28
+ 27,Jason Lopez,26,jason.lopez@company.com, 85000 ,2024-08-01,Data Science,7.8,New York,Junior Analyst
29
+ 28,Nancy Hill,thirty,nancy at techcorp.com,101000,15th Aug 2024,ai,9.1,Chicago,Senior Engineer
30
+ 29,paul scott,28,paul@tech,95000,2024/09/01,DataScience,8.4,Seattle,Data Engineer
31
+ 30,,27,betty@techcorp.com,88000,2024-09-15,ML,8.2,Boston,Engineer
32
+ 31,Mark Green,33,mark.g@@company.com,118000USD,01-10-2024,Artificial Intelligence,9.2,San Francisco,Staff Engineer
33
+ 32,Sandra Adams,twenty eight,sandra@techcorp.com,92000,2024-10-10,AI,8.0,NYC,Engineer
34
+ 33,GEORGE BAKER,29,george at company.com, 106000,2024/10/20,Data science,8.8,Chicago,Senior Analyst
35
+ 34,Carol Gonzalez,31,carol@tech.com,$99000,2024-11-01,Machine Learning,8.5,Seattle,ML Engineer
36
+ 35,Brian Nelson,26,brian@@techcorp.com,NaN,2024-11-15,ai,7.9,Boston,Junior Engineer
37
+ 36, Susan Carter ,thirty two,susan@company.com,114000,01-12-2024,AI/ML,9.3,San Francisco,Principal Engineer
38
+ 37,edward mitchell,27,edward.m@techcorp.com,90000 ,15/12/2024,DataScience,8.3,New York,Data Scientist
39
+ 38,Margaret Perez,28,margaret at tech.com,97000,2024/12/20,AI,8.7,Chicago,Senior Engineer
40
+ 39,JOSEPH ROBERTS,thirty,joseph@@company.com,108000,2025-01-05,Data Science,nine,Seattle,Tech Lead
41
+ 40,Lisa Turner,29,lisa.t@techcorp,103000,2025-01-15,Machine Learning,8.6,Boston,Senior ML Engineer
42
+ 41,Frank Phillips,27,frank@techcorp.com,$94000,20/01/2025,ai,8.1,San Francisco,Engineer
43
+ 42,Helen Campbell, 26 ,helen at company.com,86000,2025/02/01,DataScience,7.8,NYC,Data Analyst
44
+ 43,matthew parker,31,matthew@@tech.com, 111000 ,05-02-2025,Artificial Intelligence,9.0,Chicago,Principal Analyst
45
+ 44,,twenty nine,elizabeth@techcorp.com,100000USD,2025-02-15,AI/ML,8.8,Seattle,Tech Lead
46
+ 45,ANTHONY EVANS,28,anthony@company,96000,2025/03/01,Data science,8.4,Boston,Senior Analyst
47
+ 46, Barbara Edwards ,30,barbara.e@techcorp.com,NaN,10th Mar 2025,Machine Learning,8.9,San Francisco,ML Engineer
48
+ 47,kenneth collins,27,kenneth at tech.com,91000 ,2025-03-20,ai,8.2,New York,Engineer
49
+ 48,Dorothy Stewart,thirty three,dorothy@@company.com,$104000,01-04-2025,DataScience,9.1,Chicago,Senior Data Scientist
50
+ 49,ANDREW SANCHEZ,26,andrew@techcorp.com,87000,2025/04/10,AI,7.7,Seattle,Junior Engineer
51
+ 50,donna morris,29,donna.m at company.com, 99000 ,2025-04-20,Data Science,8.5,Boston,Data Analyst
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ streamlit>=1.31
2
+ pandas>=2.0
3
+ python-dateutil>=2.8
4
+ openai>=1.0.0
5
+ pydantic>=2.0
6
+ rapidfuzz>=3.0
7
+ langgraph>=0.2.0
8
+ pytest>=8.0
src/agent/graph.py ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import math
5
+ from typing import Any, Dict, List, Optional, TypedDict
6
+
7
+ from langgraph.graph import StateGraph, END
8
+ from pydantic import ValidationError
9
+
10
+ from src.agent.schemas import ExtractedData
11
+
12
+
13
+ # ---------- JSON safety ----------
14
+ def _json_safe(obj):
15
+ """Recursively convert NaN/inf to None so payload becomes valid JSON."""
16
+ if obj is None:
17
+ return None
18
+ if isinstance(obj, float):
19
+ if math.isnan(obj) or math.isinf(obj):
20
+ return None
21
+ return obj
22
+ if isinstance(obj, dict):
23
+ return {k: _json_safe(v) for k, v in obj.items()}
24
+ if isinstance(obj, list):
25
+ return [_json_safe(v) for v in obj]
26
+ return obj
27
+
28
+
29
+ # ---------- OpenAI helper ----------
30
+ def _openai_client(api_key: str):
31
+ from openai import OpenAI
32
+
33
+ # robust for flaky networks
34
+ return OpenAI(api_key=api_key, timeout=60, max_retries=5)
35
+
36
+
37
+ EXTRACT_SYSTEM = """You are a data extraction + validation agent.
38
+
39
+ Your job: convert messy text into STRICT JSON that matches this schema:
40
+
41
+ {
42
+ "employees": [
43
+ {
44
+ "user_id": int,
45
+ "name": string,
46
+ "age": int|null,
47
+ "email": string|null,
48
+ "salary": number|null,
49
+ "join_date": "YYYY-MM-DD"|null,
50
+ "department": one of ["Artificial Intelligence","AI/ML","Machine Learning","Data Science"],
51
+ "performance_score": number|null (0..10),
52
+ "location": string|null,
53
+ "job_title": string|null
54
+ }
55
+ ],
56
+ "rejected": [
57
+ { "raw_record": string, "reasons": [string, ...] }
58
+ ]
59
+ }
60
+
61
+ CRITICAL RULES (NO HALLUCINATION):
62
+ - NEVER invent user_id. If user_id is missing/uncertain, DO NOT guess.
63
+ Put that record into "rejected" with reason "missing user_id".
64
+ - NEVER guess values from vague text like "maybe", "around", "probably", "approx".
65
+ Use null for uncertain optional fields.
66
+ - If a record cannot be made schema-valid WITHOUT guessing required fields, reject it.
67
+ - Do not fabricate emails or domains. If email is invalid -> null (or reject only if required, but email is optional here).
68
+
69
+ Normalization rules:
70
+ - Output JSON ONLY, no markdown.
71
+ - If a field is missing, set it to null (not empty string).
72
+ - Normalize department values:
73
+ AI/ai/Artificial Intelligence -> "Artificial Intelligence"
74
+ AI/ML -> "AI/ML"
75
+ ML/Machine Learning -> "Machine Learning"
76
+ DataScience/Data science -> "Data Science"
77
+ - Convert word numbers (e.g., "twenty nine") to integers when clear.
78
+ - Convert dates to ISO YYYY-MM-DD if possible, else null.
79
+ - Salary: remove $ and commas; if missing, null.
80
+ - performance_score must be 0..10; if value is out of range or unclear -> null.
81
+ """
82
+
83
+ CORRECT_SYSTEM = """You are a self-correcting data validation agent.
84
+
85
+ You will be given:
86
+ - the previous JSON you produced
87
+ - a validation error message describing why it failed
88
+
89
+ Fix the JSON to satisfy the schema.
90
+
91
+ CRITICAL RULES (NO HALLUCINATION):
92
+ - NEVER invent user_id. If user_id is missing/uncertain, reject the record instead of guessing.
93
+ - NEVER guess uncertain values (maybe/around/probably). Use null for optional fields.
94
+ - Prefer moving problematic records to "rejected" with clear reasons rather than fabricating data.
95
+
96
+ Rules:
97
+ - Output JSON ONLY.
98
+ - Keep valid records in "employees".
99
+ - Put non-fixable records in "rejected" with reasons.
100
+ - Use null for missing fields (not empty strings).
101
+ """
102
+
103
+
104
+ # ---------- LangGraph State ----------
105
+ class AgentState(TypedDict):
106
+ raw_text: str
107
+ attempt: int
108
+ max_attempts: int
109
+ last_json_text: str
110
+ validation_error: str
111
+ result: Optional[Dict[str, Any]]
112
+ log: List[Dict[str, Any]]
113
+
114
+
115
+ def _llm_extract(state: AgentState, api_key: str, model: str) -> AgentState:
116
+ client = _openai_client(api_key)
117
+ payload = {"raw_text": state["raw_text"]}
118
+
119
+ resp = client.responses.create(
120
+ model=model,
121
+ input=[
122
+ {"role": "system", "content": EXTRACT_SYSTEM},
123
+ {"role": "user", "content": json.dumps(payload)},
124
+ ],
125
+ temperature=0,
126
+ max_output_tokens=1400,
127
+ )
128
+ out = (resp.output_text or "").strip()
129
+
130
+ state["last_json_text"] = out
131
+ state["log"].append({"step": "extract", "attempt": state["attempt"], "output": out[:2000]})
132
+ return state
133
+
134
+
135
+ def _validate(state: AgentState) -> AgentState:
136
+ try:
137
+ data = ExtractedData.model_validate_json(state["last_json_text"])
138
+ state["result"] = _json_safe(data.model_dump())
139
+ state["validation_error"] = ""
140
+ state["log"].append({"step": "validate", "attempt": state["attempt"], "status": "pass"})
141
+ except ValidationError as e:
142
+ state["result"] = None
143
+ state["validation_error"] = str(e)
144
+ state["log"].append(
145
+ {
146
+ "step": "validate",
147
+ "attempt": state["attempt"],
148
+ "status": "fail",
149
+ "error": state["validation_error"][:2000],
150
+ }
151
+ )
152
+ return state
153
+
154
+
155
+ def _llm_correct(state: AgentState, api_key: str, model: str) -> AgentState:
156
+ client = _openai_client(api_key)
157
+ payload = {
158
+ "previous_json": state["last_json_text"],
159
+ "validation_error": state["validation_error"],
160
+ }
161
+
162
+ resp = client.responses.create(
163
+ model=model,
164
+ input=[
165
+ {"role": "system", "content": CORRECT_SYSTEM},
166
+ {"role": "user", "content": json.dumps(payload)},
167
+ ],
168
+ temperature=0,
169
+ max_output_tokens=1400,
170
+ )
171
+ out = (resp.output_text or "").strip()
172
+
173
+ state["last_json_text"] = out
174
+ state["log"].append({"step": "correct", "attempt": state["attempt"], "output": out[:2000]})
175
+ return state
176
+
177
+
178
+ def _should_retry(state: AgentState) -> str:
179
+ if state["result"] is not None:
180
+ return "finalize"
181
+ if state["attempt"] >= state["max_attempts"]:
182
+ return "finalize"
183
+ return "retry"
184
+
185
+
186
+ def build_graph(api_key: str, model: str):
187
+ g = StateGraph(AgentState)
188
+
189
+ g.add_node("extract", lambda s: _llm_extract(s, api_key, model))
190
+ g.add_node("validate", _validate)
191
+ g.add_node("correct", lambda s: _llm_correct(s, api_key, model))
192
+
193
+ g.set_entry_point("extract")
194
+ g.add_edge("extract", "validate")
195
+ g.add_conditional_edges(
196
+ "validate",
197
+ _should_retry,
198
+ {"retry": "correct", "finalize": END},
199
+ )
200
+
201
+ # after correcting, increment attempt then validate again
202
+ def inc_attempt(state: AgentState) -> AgentState:
203
+ state["attempt"] += 1
204
+ return state
205
+
206
+ g.add_node("inc_attempt", inc_attempt)
207
+ g.add_edge("correct", "inc_attempt")
208
+ g.add_edge("inc_attempt", "validate")
209
+
210
+ return g.compile()
211
+
212
+
213
+ def run_agent(raw_text: str, api_key: str, model: str = "gpt-4.1-mini", max_attempts: int = 3):
214
+ graph = build_graph(api_key, model)
215
+ init: AgentState = {
216
+ "raw_text": raw_text,
217
+ "attempt": 1,
218
+ "max_attempts": max_attempts,
219
+ "last_json_text": "",
220
+ "validation_error": "",
221
+ "result": None,
222
+ "log": [],
223
+ }
224
+ final_state = graph.invoke(init)
225
+ return final_state
src/agent/offline_runner.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ from typing import List, Dict, Any, Optional
3
+ from pydantic import ValidationError
4
+ from src.agent.schemas import ExtractedData
5
+
6
+ def run_offline_agent(raw_text: str, candidate_json_outputs: List[str], max_attempts: int = 3):
7
+ """
8
+ Offline testing: instead of calling OpenAI, we feed the agent JSON outputs
9
+ and see if validation passes, and how many attempts it takes.
10
+ """
11
+ log = []
12
+ last = ""
13
+ attempt = 1
14
+
15
+ for out in candidate_json_outputs[:max_attempts]:
16
+ last = out
17
+ log.append({"step": "extract", "attempt": attempt, "output": out})
18
+
19
+ try:
20
+ data = ExtractedData.model_validate_json(out)
21
+ log.append({"step": "validate", "attempt": attempt, "status": "pass"})
22
+ return {"result": data.model_dump(), "log": log, "last_json_text": last}
23
+ except ValidationError as e:
24
+ log.append({"step": "validate", "attempt": attempt, "status": "fail", "error": str(e)})
25
+ attempt += 1
26
+
27
+ return {"result": None, "log": log, "last_json_text": last}
src/agent/schemas.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from datetime import date
4
+ from typing import List, Optional, Literal
5
+
6
+ from pydantic import BaseModel, Field, field_validator
7
+
8
+ Department = Literal[
9
+ "Artificial Intelligence",
10
+ "AI/ML",
11
+ "Machine Learning",
12
+ "Data Science",
13
+ ]
14
+
15
+
16
+ class Employee(BaseModel):
17
+ user_id: int
18
+ name: str = Field(min_length=1)
19
+ age: Optional[int] = Field(default=None, ge=16, le=80)
20
+ email: Optional[str] = None
21
+ salary: Optional[float] = Field(default=None, ge=0)
22
+ join_date: Optional[date] = None
23
+ department: Department
24
+ performance_score: Optional[float] = Field(default=None, ge=0, le=10)
25
+ location: Optional[str] = None
26
+ job_title: Optional[str] = None
27
+
28
+ @field_validator("name")
29
+ @classmethod
30
+ def normalize_name(cls, v: str) -> str:
31
+ v = " ".join(str(v).strip().split())
32
+ return v.title()
33
+
34
+
35
+ class RejectedRecord(BaseModel):
36
+ raw_record: str = Field(min_length=1)
37
+ reasons: List[str] = Field(min_length=1)
38
+
39
+
40
+ class ExtractedData(BaseModel):
41
+ # Valid, schema-compliant records only
42
+ employees: List[Employee] = Field(default_factory=list)
43
+
44
+ # Records that cannot be made valid WITHOUT guessing required fields
45
+ rejected: List[RejectedRecord] = Field(default_factory=list)
src/core/cleaning.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from dataclasses import dataclass
5
+ from typing import Dict, List, Tuple, Optional
6
+
7
+ import pandas as pd
8
+ from dateutil import parser as dtparser
9
+ from rapidfuzz import process, fuzz
10
+
11
+ WORD_NUMS = {
12
+ "zero": 0, "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
13
+ "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10,
14
+ "eleven": 11, "twelve": 12, "thirteen": 13, "fourteen": 14,
15
+ "fifteen": 15, "sixteen": 16, "seventeen": 17, "eighteen": 18, "nineteen": 19,
16
+ "twenty": 20, "thirty": 30, "forty": 40, "fifty": 50, "sixty": 60
17
+ }
18
+
19
+ DEPT_CANON = {
20
+ "ai": "Artificial Intelligence",
21
+ "artificial intelligence": "Artificial Intelligence",
22
+ "ai/ml": "AI/ML",
23
+ "ml": "Machine Learning",
24
+ "machine learning": "Machine Learning",
25
+ "data science": "Data Science",
26
+ "datascience": "Data Science",
27
+ }
28
+
29
+ LOCATION_CANON = {
30
+ "nyc": "New York",
31
+ "new york": "New York",
32
+ "san francisco": "San Francisco",
33
+ "chicago": "Chicago",
34
+ "seattle": "Seattle",
35
+ "boston": "Boston",
36
+ }
37
+
38
+ EMAIL_RE = re.compile(r"^[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}$")
39
+
40
+ @dataclass
41
+ class CleaningReport:
42
+ rows: int
43
+ fixes: Dict[str, int]
44
+ warnings: List[str]
45
+
46
+ def _clean_name(s: str) -> Optional[str]:
47
+ if s is None or pd.isna(s):
48
+ return None
49
+ s = str(s).strip()
50
+ s = re.sub(r"\s+", " ", s)
51
+ s = " ".join([w.capitalize() for w in s.split()])
52
+ return s if s else None
53
+
54
+ def _parse_word_number(s: str) -> Optional[int]:
55
+ if s is None:
56
+ return None
57
+ t = str(s).strip().lower()
58
+ if t == "":
59
+ return None
60
+ if re.fullmatch(r"\d+", t):
61
+ return int(t)
62
+ parts = re.split(r"[\s\-]+", t)
63
+ total = 0
64
+ matched = False
65
+ for p in parts:
66
+ if p in WORD_NUMS:
67
+ total += WORD_NUMS[p]
68
+ matched = True
69
+ else:
70
+ return None
71
+ return total if matched else None
72
+
73
+ def _clean_email(s: str) -> Tuple[str, bool]:
74
+ if s is None:
75
+ return "unknown@unknown.com", True
76
+ t = str(s).strip().lower()
77
+ if t == "":
78
+ return "unknown@unknown.com", True
79
+ t = t.replace(" at ", "@").replace(" dot ", ".")
80
+ t = t.replace("..", ".")
81
+ t = re.sub(r"@{2,}", "@", t)
82
+ if EMAIL_RE.match(t):
83
+ return t, (t != str(s).strip().lower())
84
+ if "@" in t and "." not in t.split("@", 1)[1]:
85
+ return t, False
86
+ return t, False
87
+
88
+ def _clean_salary(s: str) -> Tuple[Optional[float], bool]:
89
+ if s is None:
90
+ return None, False
91
+ t = str(s).strip()
92
+ if t == "" or t.lower() == "nan":
93
+ return None, False
94
+ t2 = re.sub(r"[,$]", "", t)
95
+ t2 = re.sub(r"usd", "", t2, flags=re.I).strip()
96
+ try:
97
+ return float(t2), (t2 != t)
98
+ except ValueError:
99
+ return None, False
100
+
101
+ def _clean_date(s: str) -> Tuple[Optional[str], bool]:
102
+ if s is None:
103
+ return None, False
104
+ t = str(s).strip()
105
+ if t == "" or t.lower() == "nan":
106
+ return None, False
107
+ try:
108
+ dt = dtparser.parse(t, dayfirst=False, fuzzy=True)
109
+ iso = dt.date().isoformat()
110
+ return iso, (iso != t)
111
+ except Exception:
112
+ return None, False
113
+
114
+ def _canon_from_map(value: str, mapping: Dict[str, str], threshold: int = 90) -> Tuple[str, bool]:
115
+ raw = (value or "").strip()
116
+ if raw == "":
117
+ return raw, False
118
+ key = raw.lower()
119
+ if key in mapping:
120
+ canon = mapping[key]
121
+ return canon, canon != raw
122
+ match = process.extractOne(key, mapping.keys(), scorer=fuzz.ratio)
123
+ if match and match[1] >= threshold:
124
+ canon = mapping[match[0]]
125
+ return canon, canon != raw
126
+ return raw, False
127
+
128
+ def clean_dataframe(df: pd.DataFrame) -> Tuple[pd.DataFrame, CleaningReport]:
129
+ out = df.copy()
130
+ fixes: Dict[str, int] = {}
131
+ warnings: List[str] = []
132
+
133
+ out.columns = [c.strip() for c in out.columns]
134
+ col_map = {c.lower(): c for c in out.columns}
135
+
136
+ name_col = col_map.get("name")
137
+ age_col = col_map.get("age")
138
+ email_col = col_map.get("email")
139
+ salary_col = col_map.get("salary")
140
+ join_col = col_map.get("join_date") or col_map.get("join date")
141
+ dept_col = col_map.get("department")
142
+ perf_col = col_map.get("performance_score") or col_map.get("performance score")
143
+ loc_col = col_map.get("location")
144
+
145
+ if name_col:
146
+ before = out[name_col].astype(str).tolist()
147
+ out[name_col] = out[name_col].apply(_clean_name)
148
+ fixes["name_normalized"] = sum(b != a for b, a in zip(before, out[name_col].tolist()))
149
+
150
+ if age_col:
151
+ def clean_age(x):
152
+ if x is None:
153
+ return None
154
+ t = str(x).strip().lower()
155
+ if t == "":
156
+ return None
157
+ n = _parse_word_number(t)
158
+ if n is not None:
159
+ return n
160
+ try:
161
+ return int(float(t))
162
+ except Exception:
163
+ return None
164
+ before = out[age_col].tolist()
165
+ out[age_col] = out[age_col].apply(clean_age)
166
+ fixes["age_parsed"] = sum(b != a for b, a in zip(before, out[age_col].tolist()))
167
+
168
+ if email_col:
169
+ before = out[email_col].tolist()
170
+ new_vals, changed = [], 0
171
+ for v in before:
172
+ cleaned, did = _clean_email(v)
173
+ if did:
174
+ changed += 1
175
+ new_vals.append(cleaned)
176
+ out[email_col] = new_vals
177
+ fixes["email_cleaned"] = changed
178
+ invalid = [e for e in out[email_col].tolist() if not EMAIL_RE.match(e)]
179
+ if invalid:
180
+ warnings.append(f"{len(invalid)} email(s) still look invalid (e.g., '{invalid[0]}').")
181
+
182
+ if salary_col:
183
+ before = out[salary_col].tolist()
184
+ vals, changed = [], 0
185
+ for v in before:
186
+ s2, did = _clean_salary(v)
187
+ if did:
188
+ changed += 1
189
+ vals.append(s2)
190
+ out[salary_col] = vals
191
+ fixes["salary_cleaned"] = changed
192
+
193
+ if join_col:
194
+ before = out[join_col].tolist()
195
+ vals, changed = [], 0
196
+ for v in before:
197
+ d2, did = _clean_date(v)
198
+ if did:
199
+ changed += 1
200
+ vals.append(d2)
201
+ out[join_col] = vals
202
+ fixes["join_date_normalized"] = changed
203
+
204
+ if dept_col:
205
+ before = out[dept_col].astype(str).tolist()
206
+ new, changed = [], 0
207
+ for v in before:
208
+ canon, did = _canon_from_map(v, DEPT_CANON, threshold=90)
209
+ if did:
210
+ changed += 1
211
+ new.append(canon)
212
+ out[dept_col] = new
213
+ fixes["department_standardized"] = changed
214
+
215
+ if loc_col:
216
+ before = out[loc_col].astype(str).tolist()
217
+ new, changed = [], 0
218
+ for v in before:
219
+ canon, did = _canon_from_map(v, LOCATION_CANON, threshold=88)
220
+ if did:
221
+ changed += 1
222
+ new.append(canon)
223
+ out[loc_col] = new
224
+ fixes["location_standardized"] = changed
225
+
226
+ if perf_col:
227
+ before = out[perf_col].tolist()
228
+ def clean_perf(x):
229
+ if x is None:
230
+ return None
231
+ t = str(x).strip().lower()
232
+ if t == "" or t == "nan":
233
+ return None
234
+ n = _parse_word_number(t)
235
+ if n is not None:
236
+ return float(n)
237
+ try:
238
+ return float(t)
239
+ except Exception:
240
+ return None
241
+ out[perf_col] = out[perf_col].apply(clean_perf)
242
+ fixes["performance_parsed"] = sum(b != a for b, a in zip(before, out[perf_col].tolist()))
243
+
244
+ return out, CleaningReport(rows=len(out), fixes=fixes, warnings=warnings)
src/core/query.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, List
4
+ import pandas as pd
5
+ from pydantic import BaseModel, Field, ValidationError
6
+ import math
7
+
8
+
9
+ class FilterSpec(BaseModel):
10
+ column: str
11
+ op: str = Field(..., description="one of: eq, neq, contains, in, gte, lte")
12
+ value: Any
13
+
14
+ class QuerySpec(BaseModel):
15
+ select: List[str] = Field(default_factory=list)
16
+ filters: List[FilterSpec] = Field(default_factory=list)
17
+ distinct: bool = True
18
+ limit: int = 50
19
+
20
+ SYSTEM_PROMPT = """You are a data query planner.
21
+ You receive:
22
+ - user_question
23
+ - available_columns
24
+ - sample_values (small)
25
+ Return ONLY valid JSON for QuerySpec with:
26
+ - select: columns needed
27
+ - filters: list of {column, op, value}
28
+ - distinct: true/false
29
+ - limit: integer <= 200
30
+
31
+ Rules:
32
+ - Prefer deterministic, simple filters.
33
+ - If the user asks for a department like "Artificial Intelligence", filter Department equals that exact string.
34
+ - If user says AI department, treat it as Department in ["Artificial Intelligence","AI/ML"] unless user explicitly excludes AI/ML.
35
+ - Do NOT invent columns.
36
+ """
37
+ def _json_safe(obj):
38
+ """Recursively convert NaN/inf to None so payload becomes valid JSON."""
39
+ if obj is None:
40
+ return None
41
+
42
+ if isinstance(obj, float):
43
+ if math.isnan(obj) or math.isinf(obj):
44
+ return None
45
+ return obj
46
+
47
+ if isinstance(obj, dict):
48
+ return {k: _json_safe(v) for k, v in obj.items()}
49
+
50
+ if isinstance(obj, list):
51
+ return [_json_safe(v) for v in obj]
52
+
53
+ return obj
54
+
55
+ def _openai_client(api_key: str):
56
+ from openai import OpenAI
57
+ return OpenAI(api_key=api_key)
58
+
59
+ def plan_query_with_llm(user_question: str, df: pd.DataFrame, api_key: str, model: str = "gpt-4.1-mini") -> QuerySpec:
60
+ cols = list(df.columns)
61
+ sample_df = df.head(8).copy()
62
+ sample_df = sample_df.where(pd.notna(sample_df), None)
63
+ sample = _json_safe(sample_df.to_dict(orient="records"))
64
+
65
+
66
+
67
+ client = _openai_client(api_key)
68
+ import json # add at top of file if not present
69
+
70
+ payload = {
71
+ "user_question": user_question,
72
+ "available_columns": cols,
73
+ "sample_values": sample,
74
+ }
75
+
76
+ resp = client.responses.create(
77
+ model=model,
78
+ input=[
79
+ {"role": "system", "content": SYSTEM_PROMPT},
80
+ {"role": "user", "content": json.dumps(payload)},
81
+ ],
82
+ temperature=0,
83
+ max_output_tokens=600,
84
+ )
85
+
86
+
87
+ text = resp.output_text
88
+ try:
89
+ spec = QuerySpec.model_validate_json(text)
90
+ spec.limit = max(1, min(int(spec.limit), 200))
91
+ return spec
92
+ except ValidationError as e:
93
+ raise ValueError(f"Could not parse model output as QuerySpec JSON. Raw output:\\n{text}\\n\\nError:\\n{e}")
94
+
95
+ def execute_query(spec: QuerySpec, df: pd.DataFrame) -> pd.DataFrame:
96
+ out = df.copy()
97
+
98
+ for f in spec.filters:
99
+ col = f.column
100
+ if col not in out.columns:
101
+ continue
102
+ op = f.op
103
+ val = f.value
104
+
105
+ if op == "eq":
106
+ if val is None:
107
+ out = out[out[col].isna()]
108
+ else:
109
+ out = out[out[col] == val]
110
+ elif op == "neq":
111
+ if val is None:
112
+ out = out[out[col].notna()]
113
+ else:
114
+ out = out[out[col] != val]
115
+ elif op == "contains":
116
+ out = out[out[col].astype(str).str.contains(str(val), case=False, na=False)]
117
+ elif op == "in":
118
+ if not isinstance(val, list):
119
+ val = [val]
120
+ out = out[out[col].isin(val)]
121
+ elif op == "gte":
122
+ out = out[pd.to_numeric(out[col], errors="coerce") >= float(val)]
123
+ elif op == "lte":
124
+ out = out[pd.to_numeric(out[col], errors="coerce") <= float(val)]
125
+
126
+ if spec.select:
127
+ safe_select = [c for c in spec.select if c in out.columns]
128
+ out = out[safe_select]
129
+
130
+ if spec.distinct:
131
+ out = out.drop_duplicates()
132
+
133
+ return out.head(spec.limit)
134
+
135
+ def summarize_results_with_llm(user_question: str, result_df: pd.DataFrame, api_key: str, model: str = "gpt-4.1-mini") -> str:
136
+ client = _openai_client(api_key)
137
+ safe_df = result_df.copy().where(pd.notna(result_df), None)
138
+ preview = _json_safe(safe_df.to_dict(orient="records"))
139
+
140
+
141
+ import json # add at top if not present
142
+
143
+ payload = {"question": user_question, "results": preview}
144
+
145
+ resp = client.responses.create(
146
+ model=model,
147
+ input=[
148
+ {"role": "system", "content": "You are a helpful analyst. Summarize results concisely and accurately."},
149
+ {"role": "user", "content": json.dumps(payload)},
150
+ ],
151
+ temperature=0.2,
152
+ max_output_tokens=500,
153
+ )
154
+
155
+ return resp.output_text
src/core/security.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from typing import Tuple
5
+
6
+ INJECTION_PATTERNS = [
7
+ r"ignore (all|any) previous",
8
+ r"system prompt",
9
+ r"reveal.*(key|secret|token)",
10
+ r"exfiltrat",
11
+ r"prompt injection",
12
+ ]
13
+
14
+ def basic_injection_check(user_text: str) -> Tuple[bool, str]:
15
+ t = (user_text or "").lower()
16
+ for pat in INJECTION_PATTERNS:
17
+ if re.search(pat, t):
18
+ return True, "That request looks like a prompt-injection attempt. I can only answer questions about the uploaded dataset."
19
+ return False, ""
src/eval/benchmarks.json ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_expectations": {
3
+ "notes": "These benchmarks assume a cleaned dataset with standardized Department values."
4
+ },
5
+ "cases": [
6
+ {
7
+ "id": "ai_dept_names_including_aiml",
8
+ "question": "Can you tell me the name of users working in Artificial Intelligence department from all the data entries?",
9
+ "mode": "spec",
10
+ "spec": {
11
+ "select": [
12
+ "Name"
13
+ ],
14
+ "filters": [
15
+ {
16
+ "column": "Department",
17
+ "op": "in",
18
+ "value": [
19
+ "Artificial Intelligence",
20
+ "AI/ML"
21
+ ]
22
+ }
23
+ ],
24
+ "distinct": true,
25
+ "limit": 200
26
+ },
27
+ "expected": {
28
+ "type": "set_equals",
29
+ "column": "Name",
30
+ "values": [
31
+ "Sarah Johnson",
32
+ "Emily Davis",
33
+ "Robert Brown",
34
+ "John Martinez",
35
+ "Amanda White",
36
+ "Jessica Moore",
37
+ "Patricia Thomas",
38
+ "Michelle Clark",
39
+ "Mark Walker",
40
+ "Thomas Allen",
41
+ "Karen Young",
42
+ "Donna Wright",
43
+ "Nancy Hill",
44
+ "Mark Green",
45
+ "Sandra Adams",
46
+ "Brian Nelson",
47
+ "Susan Carter",
48
+ "Margaret Perez",
49
+ "Frank Phillips",
50
+ "Matthew Parker",
51
+ "Kenneth Collins",
52
+ "Andrew Sanchez"
53
+ ]
54
+ }
55
+ },
56
+ {
57
+ "id": "high_performers_ge_9",
58
+ "question": "Show employees with performance score >= 9",
59
+ "mode": "spec",
60
+ "spec": {
61
+ "select": [
62
+ "Name",
63
+ "Department",
64
+ "Performance_Score"
65
+ ],
66
+ "filters": [
67
+ {
68
+ "column": "Performance_Score",
69
+ "op": "gte",
70
+ "value": 9
71
+ }
72
+ ],
73
+ "distinct": true,
74
+ "limit": 200
75
+ },
76
+ "expected": {
77
+ "type": "row_count_gte",
78
+ "min_rows": 1
79
+ }
80
+ },
81
+ {
82
+ "id": "salary_missing",
83
+ "question": "How many employees have missing salary?",
84
+ "mode": "spec",
85
+ "spec": {
86
+ "select": [
87
+ "Salary"
88
+ ],
89
+ "filters": [
90
+ {
91
+ "column": "Salary",
92
+ "op": "eq",
93
+ "value": null
94
+ }
95
+ ],
96
+ "distinct": false,
97
+ "limit": 1000
98
+ },
99
+ "expected": {
100
+ "type": "row_count_gte",
101
+ "min_rows": 1
102
+ }
103
+ }
104
+ ]
105
+ }
src/eval/run_agent_suite.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Run the agent on all files in test_inputs/ and print a clear summary.
3
+
4
+ Usage:
5
+ export OPENAI_API_KEY="sk-..."
6
+ python -m src.eval.run_agent_suite
7
+ """
8
+
9
+ from pathlib import Path
10
+ import json
11
+ import os
12
+ from statistics import mean
13
+
14
+ from src.agent.graph import run_agent
15
+
16
+ TEST_DIR = Path("test_inputs")
17
+ MODEL = "gpt-4.1-mini"
18
+ MAX_ATTEMPTS = 4
19
+
20
+
21
+ def main():
22
+ api_key = os.environ.get("OPENAI_API_KEY", "").strip()
23
+ if not api_key:
24
+ raise SystemExit(
25
+ "OPENAI_API_KEY is not set. Export your key before running the suite:\n\n"
26
+ "export OPENAI_API_KEY=\"sk-...\""
27
+ )
28
+
29
+ inputs = sorted(TEST_DIR.glob("*.txt"))
30
+ if not inputs:
31
+ raise SystemExit("No test_inputs/*.txt files found. Create the sample inputs first.")
32
+
33
+ total = 0
34
+ passed_count = 0
35
+ correct_handling = 0
36
+ attempts_hist = []
37
+ attempts_hist_for_passed = []
38
+
39
+ # OPTIONAL: define expected behavior per file (True=expected PASS/valid handling)
40
+ # For correctness metric we will consider "handled correctly" as either:
41
+ # - produced employees & they match expectations (not available here), or
42
+ # - produced 0 employees and non-empty rejected (for cases that should be rejected)
43
+ # You can expand expected_outcomes if you want to mark specific cases as expected_fail, etc.
44
+ expected_outcomes = {
45
+ # "case09_no_ids.txt": "reject", # example: expected to reject (no user_id)
46
+ # "case15_extreme_noise.txt": "reject",
47
+ }
48
+
49
+ for p in inputs:
50
+ total += 1
51
+ raw = p.read_text()
52
+ print(f"\n=== Running: {p.name} ===")
53
+ try:
54
+ final = run_agent(raw, api_key=api_key, model=MODEL, max_attempts=MAX_ATTEMPTS)
55
+ except Exception as e:
56
+ print(f"[ERROR] agent crashed for {p.name}: {e}")
57
+ continue
58
+
59
+ result = final.get("result")
60
+ log = final.get("log", [])
61
+
62
+ # attempts used is the max attempt number seen in log, fall back to 0
63
+ attempts_used = max((entry.get("attempt", 0) for entry in log), default=0)
64
+
65
+ employees_n = 0
66
+ rejected_n = 0
67
+ if result:
68
+ employees_n = len(result.get("employees", []))
69
+ rejected_n = len(result.get("rejected", []))
70
+
71
+ # Define pass = result is not None (valid schema produced)
72
+ passed = result is not None
73
+
74
+ # Print a concise line
75
+ print(f"{p.name}: {'PASS' if passed else 'FAIL'} | attempts={attempts_used} | employees={employees_n} | rejected={rejected_n}")
76
+
77
+ # Print extra info for failures or suspicious cases
78
+ if not passed:
79
+ print("-> Agent failed to produce schema-valid JSON within retry limit.")
80
+ print("Last JSON attempt:")
81
+ print(final.get("last_json_text", ""))
82
+ else:
83
+ # Optionally print the JSON for inspection of suspicious cases
84
+ if employees_n == 0 and rejected_n > 0:
85
+ print("-> No valid employees extracted; records were rejected (no hallucination).")
86
+ # You can uncomment to always show the JSON
87
+ # print(json.dumps(result, indent=2))
88
+
89
+ # Evaluate "correct handling" heuristically:
90
+ # if expected_outcomes says "reject" and the agent indeed rejected (employees==0 and rejected>0) -> correct
91
+ expected = expected_outcomes.get(p.name)
92
+ handled_correctly = False
93
+ if expected == "reject":
94
+ handled_correctly = (employees_n == 0 and rejected_n > 0)
95
+ else:
96
+ # default heuristic: producing a schema result is considered handling (but inspect counts)
97
+ handled_correctly = passed
98
+
99
+ if handled_correctly:
100
+ correct_handling += 1
101
+
102
+ if passed:
103
+ passed_count += 1
104
+ attempts_hist_for_passed.append(attempts_used)
105
+ attempts_hist.append(attempts_used)
106
+
107
+ # Summary
108
+ print("\n=== SUITE SUMMARY ===")
109
+ print(f"Total cases: {total}")
110
+ print(f"Schema-valid produced (pass): {passed_count}/{total} = {passed_count/total:.2%}")
111
+ print(f"Correct-handling (heuristic expected): {correct_handling}/{total} = {correct_handling/total:.2%}")
112
+ if attempts_hist:
113
+ print(f"Avg attempts (all cases): {mean(attempts_hist):.2f}")
114
+ if attempts_hist_for_passed:
115
+ print(f"Avg attempts (passed cases): {mean(attempts_hist_for_passed):.2f}")
116
+
117
+
118
+ if __name__ == "__main__":
119
+ main()
src/eval/run_eval.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import os
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from typing import Any, Dict, List, Optional, Tuple
9
+
10
+ import pandas as pd
11
+
12
+ from src.core.cleaning import clean_dataframe
13
+ from src.core.query import QuerySpec, FilterSpec, execute_query, plan_query_with_llm
14
+
15
+
16
+ @dataclass
17
+ class CaseResult:
18
+ case_id: str
19
+ passed: bool
20
+ details: str
21
+
22
+
23
+ def _load_benchmarks(path: Path) -> Dict[str, Any]:
24
+ return json.loads(path.read_text())
25
+
26
+
27
+ def _spec_from_dict(d: Dict[str, Any]) -> QuerySpec:
28
+ filters = [FilterSpec(**f) for f in d.get("filters", [])]
29
+ return QuerySpec(
30
+ select=d.get("select", []),
31
+ filters=filters,
32
+ distinct=bool(d.get("distinct", True)),
33
+ limit=int(d.get("limit", 50)),
34
+ )
35
+
36
+
37
+ def _check_expected(result_df: pd.DataFrame, expected: Dict[str, Any]) -> Tuple[bool, str]:
38
+ et = expected.get("type")
39
+ if et == "set_equals":
40
+ col = expected["column"]
41
+ want = set(expected["values"])
42
+ if col not in result_df.columns:
43
+ return False, f"Missing expected column '{col}'. Columns: {list(result_df.columns)}"
44
+ got = set([x for x in result_df[col].dropna().astype(str).tolist()])
45
+ missing = want - got
46
+ extra = got - want
47
+ if missing or extra:
48
+ return False, f"Set mismatch. Missing={sorted(missing)} Extra={sorted(extra)}"
49
+ return True, "OK"
50
+ if et == "row_count_gte":
51
+ min_rows = int(expected["min_rows"])
52
+ n = len(result_df)
53
+ return (n >= min_rows), f"Rows={n}, expected >= {min_rows}"
54
+ if et == "row_count_equals":
55
+ want = int(expected["rows"])
56
+ n = len(result_df)
57
+ return (n == want), f"Rows={n}, expected == {want}"
58
+ return False, f"Unknown expected.type '{et}'"
59
+
60
+
61
+ def run(args: argparse.Namespace) -> int:
62
+ bench = _load_benchmarks(Path(args.benchmarks))
63
+ df_raw = pd.read_csv(args.csv)
64
+ df, report = clean_dataframe(df_raw)
65
+
66
+ results: List[CaseResult] = []
67
+
68
+ for case in bench["cases"]:
69
+ cid = case["id"]
70
+ mode = case.get("mode", "spec")
71
+ expected = case["expected"]
72
+
73
+ try:
74
+ if mode == "spec":
75
+ spec = _spec_from_dict(case["spec"])
76
+ elif mode == "llm":
77
+ if not args.api_key and not os.getenv("OPENAI_API_KEY"):
78
+ results.append(CaseResult(cid, False, "No API key for LLM mode"))
79
+ continue
80
+ api_key = args.api_key or os.getenv("OPENAI_API_KEY", "")
81
+ spec = plan_query_with_llm(case["question"], df, api_key=api_key, model=args.model)
82
+ else:
83
+ results.append(CaseResult(cid, False, f"Unknown mode '{mode}'"))
84
+ continue
85
+
86
+ out = execute_query(spec, df)
87
+ ok, details = _check_expected(out, expected)
88
+ results.append(CaseResult(cid, ok, details))
89
+ except Exception as e:
90
+ results.append(CaseResult(cid, False, f"Exception: {e}"))
91
+
92
+ passed = sum(1 for r in results if r.passed)
93
+ total = len(results)
94
+
95
+ print("\n=== Cleaning report ===")
96
+ print({"rows": report.rows, "fixes": report.fixes, "warnings": report.warnings})
97
+
98
+ print("\n=== Benchmark results ===")
99
+ for r in results:
100
+ status = "PASS" if r.passed else "FAIL"
101
+ print(f"[{status}] {r.case_id}: {r.details}")
102
+
103
+ print(f"\nSummary: {passed}/{total} passed")
104
+ return 0 if passed == total else 1
105
+
106
+
107
+ if __name__ == "__main__":
108
+ p = argparse.ArgumentParser(description="Run benchmark evaluation for the AI Data Validation Agent")
109
+ p.add_argument("--csv", required=True, help="Path to CSV dataset")
110
+ p.add_argument("--benchmarks", default="src/eval/benchmarks.json", help="Path to benchmarks.json")
111
+ p.add_argument("--api-key", default="", help="OpenAI API key (optional; only needed for llm-mode cases)")
112
+ p.add_argument("--model", default="gpt-4.1-mini", help="Model for llm-mode cases")
113
+ raise SystemExit(run(p.parse_args()))
src/tests/test_query.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from src.core.query import QuerySpec, FilterSpec, execute_query
3
+
4
+ def test_execute_query_eq():
5
+ df = pd.DataFrame([
6
+ {"Name":"A", "Department":"Artificial Intelligence"},
7
+ {"Name":"B", "Department":"Data Science"},
8
+ ])
9
+ spec = QuerySpec(select=["Name"], filters=[FilterSpec(column="Department", op="eq", value="Artificial Intelligence")])
10
+ out = execute_query(spec, df)
11
+ assert out["Name"].tolist() == ["A"]
src/tests_agent/test_offline_runner.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from src.agent.offline_runner import run_offline_agent
3
+
4
+ def test_offline_runner_fails_then_passes():
5
+ bad = json.dumps({
6
+ "employees": [{
7
+ "user_id": "101", # wrong type
8
+ "name": "", # invalid (min length)
9
+ "department": "ai" # invalid enum
10
+ }]
11
+ })
12
+
13
+ good = json.dumps({
14
+ "employees": [{
15
+ "user_id": 101,
16
+ "name": "Michael Chen",
17
+ "age": 29,
18
+ "email": None,
19
+ "salary": 120000,
20
+ "join_date": None,
21
+ "department": "Artificial Intelligence",
22
+ "performance_score": None,
23
+ "location": None,
24
+ "job_title": None
25
+ }]
26
+ })
27
+
28
+ out = run_offline_agent("raw", [bad, good], max_attempts=2)
29
+ assert out["result"] is not None
30
+ assert len(out["log"]) >= 3
src/tests_agent/tests_schema.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.agent.schemas import ExtractedData
2
+
3
+ def test_schema_accepts_valid_employee():
4
+ payload = {
5
+ "employees": [
6
+ {
7
+ "user_id": 101,
8
+ "name": "Michael Chen",
9
+ "age": 29,
10
+ "email": None,
11
+ "salary": 120000,
12
+ "join_date": "2024-01-01",
13
+ "department": "Artificial Intelligence",
14
+ "performance_score": 9.5,
15
+ "location": "Chicago",
16
+ "job_title": "Engineer",
17
+ }
18
+ ]
19
+ }
20
+
21
+ data = ExtractedData.model_validate(payload)
22
+ assert data.employees[0].user_id == 101
test_inputs/case01_simple.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ ID: 101
2
+ Name: michael chen
3
+ Age: twenty nine
4
+ Dept: ai
5
+ Salary: $120,000
6
+ Performance: 11
7
+ Email: michael at gmail.com
test_inputs/case02_word_numbers.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ ID: 102
2
+ Name: maria lopez
3
+ Age: thirty two
4
+ Dept: ai
5
+ Salary: $120,000
6
+ Performance: nine
7
+ Email: maria at gmail.com
test_inputs/case03_missing_fields.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ ID: 103
2
+ Name: kevin brown
3
+ Dept: Data Science
4
+ Salary: 88000
test_inputs/case04_bad_performance.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ ID: 104
2
+ Name: Alice Green
3
+ Age: 27
4
+ Dept: ML
5
+ Salary: 100000
6
+ Performance: 15
7
+ Email: alice@company.com
test_inputs/case05_weird_date.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ ID: 105
2
+ Name: David Chen
3
+ Age: 29
4
+ Dept: AI/ML
5
+ Salary: 99000
6
+ Join: 14th Feb 2024
7
+ Performance: 7.8
test_inputs/case06_random_noise.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ ### Employee Record ###
2
+ blah blah
3
+ ID => 106
4
+ Employee Name: robert taylor
5
+ AGE: 31
6
+ Department: machine learning
7
+ salary approx $110,000 per year
8
+ performance rating 8
9
+ END RECORD
test_inputs/case07_duplicate_fields.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ ID: 107
2
+ Name: Emma Watson
3
+ Name: emma watson
4
+ Age: 26
5
+ Dept: DataScience
6
+ Salary: $87000
7
+ Performance: 8.2
test_inputs/case08_two_records.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Employee 1:
2
+ ID: 201
3
+ Name: michael scott
4
+ Age: forty five
5
+ Dept: ai
6
+ Salary: 150000
7
+ Performance: 9
8
+
9
+ Employee 2:
10
+ ID: 202
11
+ Name: jim halpert
12
+ Age: 30
13
+ Department: sales
14
+ Salary: 85000
15
+ Performance: 7
test_inputs/case09_no_ids.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Name: Sarah Connor
2
+ Age: 33
3
+ Dept: Artificial Intelligence
4
+ Salary: 115000
5
+ Performance: 8.7
test_inputs/case10_conflicting_values.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ ID: 301
2
+ Name: Peter Parker
3
+ Age: 25
4
+ Age: 40
5
+ Dept: AI
6
+ Salary: 70000
7
+ Performance: 6
test_inputs/case11_unstructured_paragraph.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Michael Jordan joined the company recently. His ID is 401.
2
+ He works in AI. He earns around $130,000 annually.
3
+ Performance is 9.2. He is 35 years old.
test_inputs/case12_partial_sentences.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 402, linda jones, thirty, data science, 92000, performance 8
test_inputs/case13_garbage_mixed.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ %%%%%%%
2
+ Random text here.
3
+ Employee start.
4
+ ID: 501
5
+ Name: Tony Stark
6
+ Age: genius
7
+ Dept: AI
8
+ Salary: 300000
9
+ Performance: 12
10
+ Email: tony@starkindustries
11
+ END
test_inputs/case14_large_block.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ ID: 601 Name: A Age: 25 Dept: AI Salary: 90000 Performance: 7
2
+ ID: 602 Name: B Age: 26 Dept: AI Salary: 91000 Performance: 8
3
+ ID: 603 Name: C Age: 27 Dept: AI Salary: 92000 Performance: 9
4
+ ID: 604 Name: D Age: 28 Dept: AI Salary: 93000 Performance: 6
5
+ ID: 605 Name: E Age: 29 Dept: AI Salary: 94000 Performance: 5
test_inputs/case15_extreme_noise.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ I have an employee somewhere. ID maybe 701?
2
+ He is probably 28-ish. Works in something like AI.
3
+ Salary could be 100k?? Not sure.
4
+ Performance: maybe 8 or 9.