bhavika24 committed on
Commit
83b6e91
·
verified ·
1 Parent(s): f3ebab8

Upload engine.py

Browse files
Files changed (1) hide show
  1. engine.py +168 -442
engine.py CHANGED
@@ -1,518 +1,244 @@
 
1
  import os
2
- import re
3
  from openai import OpenAI
4
- from difflib import get_close_matches
5
  from datetime import datetime
6
 
7
# In-memory log of the interactions recorded through log_interaction().
TRANSCRIPT = []


def log_interaction(user_q, sql=None, result=None, error=None):
    """Append one interaction record to ``TRANSCRIPT``.

    Args:
        user_q: The user's question.
        sql: The SQL produced for the question, if any.
        result: The query result. A list is trimmed to its first 10 items;
            any other value is stored unchanged.
        error: An error description, if any.
    """
    preview = result[:10] if isinstance(result, list) else result
    record = {
        "timestamp": datetime.utcnow().isoformat(),
        "question": user_q,
        "sql": sql,
        "result_preview": preview,
        "error": error,
    }
    TRANSCRIPT.append(record)
18
-
19
  # =========================
20
- # SETUP
21
  # =========================
22
 
23
# Read the API key from the environment and refuse to continue without it,
# so a missing key is reported at import time instead of on the first request.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY environment variable is not set")

# Module-wide client used by call_llm().
client = OpenAI(api_key=api_key)
28
-
29
 
30
  # =========================
31
- # CONVERSATION STATE
32
  # =========================
33
 
34
# Conversation-state slots; both start out unset.
LAST_PROMPT_TYPE = LAST_SUGGESTED_DATE = None
 
 
36
 
37
- # =========================
38
- # HUMAN RESPONSE HELPERS
39
- # =========================
40
 
41
def humanize(text):
    """Return ``text`` prefixed with "Sure " and a blank line."""
    return "Sure \n\n{}".format(text)
43
 
44
def friendly(text):
    """Return ``text`` unchanged (pass-through hook for response wording)."""
    unchanged = text
    return unchanged
46
 
47
def is_confirmation(text):
    """Return True when ``text``, trimmed and lower-cased, is a one-word "yes"."""
    affirmatives = {"yes", "yep", "yeah", "ok", "okay", "sure"}
    normalized = text.strip().lower()
    return normalized in affirmatives
 
 
 
 
49
 
50
def is_why_question(text):
    """Return True when ``text`` starts with "why", ignoring case and leading space."""
    normalized = text.strip().lower()
    return normalized[:3] == "why"
52
 
53
- # =========================
54
- # SPELL CORRECTION
55
- # =========================
 
 
 
 
 
 
 
 
 
 
 
56
 
57
# Vocabulary that misspelled question words are corrected towards.
KNOWN_TERMS = [
    "patient", "patients",
    "admission", "admissions",
    "icu", "stay", "icustay",
    "diagnosis", "procedure",
    "medication", "lab",
    "year", "month", "recent", "today",
]


def correct_spelling(q):
    """Replace misspelled domain terms in ``q`` with their closest known term.

    Each whitespace-separated word is compared (lower-cased, with leading and
    trailing ``,``, ``.`` and ``?`` removed) against ``KNOWN_TERMS`` using
    ``difflib.get_close_matches`` with a 0.8 cutoff.

    Only words that are actually corrected are rewritten. Every other word is
    returned exactly as typed, so names, identifiers and values in the
    question keep their case and punctuation. The previous implementation
    lower-cased and stripped every word, whether or not it was a typo.

    Args:
        q: The user's question.

    Returns:
        The question with corrected words, joined by single spaces.
    """
    fixed = []
    for word in q.split():
        core = word.strip(",.?")
        lowered = core.lower()
        match = get_close_matches(lowered, KNOWN_TERMS, n=1, cutoff=0.8)
        if match and match[0] != lowered:
            # Swap only the misspelled core; keep surrounding punctuation.
            start = word.find(core)
            word = word[:start] + match[0] + word[start + len(core):]
        fixed.append(word)
    return " ".join(fixed)
75
 
76
- # =========================
77
- # SCHEMA
78
- # =========================
79
- import json
80
- from functools import lru_cache
81
def col_desc(desc):
    """Return the description text for one metadata entry.

    A dict yields its ``"description"`` value (empty string when the key is
    absent); any other value is converted with ``str``.
    """
    return desc.get("description", "") if isinstance(desc, dict) else str(desc)
86
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
@lru_cache(maxsize=1)
def load_ai_schema():
    """Load and cache the table metadata stored in ``metadata.json``.

    The file is read from the current working directory as UTF-8. The result
    is cached, so the file is parsed at most once per process unless
    ``load_ai_schema.cache_clear()`` is called.

    Returns:
        The parsed metadata dictionary.

    Raises:
        FileNotFoundError: ``metadata.json`` does not exist.
        ValueError: The file cannot be read, is not valid JSON, or its top
            level is not a JSON object.
    """
    # Only the I/O and parsing are guarded. The shape check below must not be
    # caught and re-wrapped by the generic handler.
    try:
        with open("metadata.json", "r", encoding="utf-8") as f:
            schema = json.load(f)
    except FileNotFoundError as e:
        raise FileNotFoundError(
            "metadata.json file not found. Please create it with your table metadata."
        ) from e
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON in metadata.json: {str(e)}") from e
    except Exception as e:
        raise ValueError(f"Error loading metadata: {str(e)}") from e

    if not isinstance(schema, dict):
        raise ValueError("Invalid metadata format: expected a dictionary")
    return schema
104
 
105
  # =========================
106
- # TABLE MATCHING (CORE LOGIC)
107
  # =========================
108
 
109
def extract_relevant_tables(question, max_tables=4):
    """Rank schema tables by relevance to ``question`` and return the best names.

    Scoring per table:
      * +6 when the table name occurs in the question;
      * +3 per column whose name occurs in the question, otherwise +1 when a
        non-stop-word question token is a substring of the column name;
      * +0.5 per non-stop-word token shared with the table description;
      * +5 per domain hint word in the question that maps to this table.

    A table is kept when its score reaches the threshold: 2 for schemas of up
    to five tables, 4 otherwise.

    Args:
        question: The user's question.
        max_tables: Maximum number of table names to return.

    Returns:
        Table names ordered by descending score. If nothing qualifies and the
        schema has at most three tables, all of them (up to ``max_tables``).
    """
    schema = load_ai_schema()
    q = question.lower()
    tokens = set(q.replace("?", "").replace(",", "").split())

    # Stop words carry no signal. As substrings ("a", "is", "on") they would
    # match almost every column name and inflate the score of wide tables.
    common_words = {"the", "is", "at", "which", "on", "for", "a", "an"}
    content_tokens = tokens - common_words

    table_names_lower = {t.lower() for t in schema}

    # Natural-language terms mapped to candidate table names.
    hint_mappings = {
        # Patients & visits
        "patient": ["patients"],
        "patients": ["patients"],
        "admission": ["admissions"],
        "admissions": ["admissions"],
        "visit": ["admissions", "icustays"],
        "visits": ["admissions", "icustays"],
        # ICU
        "icu": ["icustays", "chartevents"],
        "stay": ["icustays"],
        "stays": ["icustays"],
        # Diagnoses / conditions
        "diagnosis": ["diagnoses_icd"],
        "diagnoses": ["diagnoses_icd"],
        "condition": ["diagnoses_icd"],
        "conditions": ["diagnoses_icd"],
        # Procedures
        "procedure": ["procedures_icd"],
        "procedures": ["procedures_icd"],
        # Medications
        "medication": ["prescriptions", "emar", "pharmacy"],
        "medications": ["prescriptions", "emar", "pharmacy"],
        "drug": ["prescriptions"],
        "drugs": ["prescriptions"],
        # Labs & vitals
        "lab": ["labevents"],
        "labs": ["labevents"],
        "vital": ["chartevents"],
        "vitals": ["chartevents"],
    }

    # Keep only hints that point at tables present in the schema.
    domain_hints = {}
    for intent, possible_tables in hint_mappings.items():
        present = [t for t in possible_tables if t in table_names_lower]
        if present:
            domain_hints[intent] = present

    # The threshold depends only on schema size, so compute it once.
    threshold = 2 if len(schema) <= 5 else 4

    matched = []
    for table, meta in schema.items():
        score = 0
        table_l = table.lower()

        # 1. Table name mentioned in the question.
        if table_l in q:
            score += 6

        # 2. Column relevance (a table without "columns" contributes nothing).
        for col in meta.get("columns", {}):
            col_l = col.lower()
            if col_l in q:
                score += 3
            elif any(tok in col_l for tok in content_tokens):
                score += 1

        # 3. Description relevance, at reduced weight.
        description = meta.get("description")
        if description:
            desc_tokens = set(col_desc(description).lower().split())
            score += len(content_tokens & desc_tokens) * 0.5

        # 4. Domain hint mapping.
        for intent, tables in domain_hints.items():
            if intent in q and table_l in tables:
                score += 5

        if score >= threshold:
            matched.append((table, score))

    matched.sort(key=lambda item: item[1], reverse=True)

    # Nothing matched, but the schema is tiny: offer every table.
    if not matched and len(schema) <= 3:
        return list(schema)[:max_tables]

    return [name for name, _ in matched[:max_tables]]
222
 
223
- # =========================
224
- # HUMAN SCHEMA DESCRIPTION
225
- # =========================
226
 
227
def describe_schema(max_tables=10):
    """Build a human-readable overview of the available tables.

    Lists up to ``max_tables`` tables with their description and first five
    columns, followed by example questions.

    Table descriptions go through ``col_desc`` and are optional. A table
    whose metadata has no ``"description"`` or no ``"columns"`` entry is still
    listed rather than raising ``KeyError``.

    Args:
        max_tables: Maximum number of tables to describe in detail.

    Returns:
        The overview text.
    """
    schema = load_ai_schema()
    total_tables = len(schema)

    parts = [
        f"Here's the data I currently have access to ({total_tables} tables):\n\n"
    ]

    # Show only the first N tables to keep the output readable.
    for table, meta in list(schema.items())[:max_tables]:
        description = col_desc(meta.get("description", ""))
        parts.append(f"• **{table.capitalize()}** — {description}\n")

        columns = meta.get("columns", {})
        for col, desc in list(columns.items())[:5]:
            parts.append(f" - {col}: {col_desc(desc)}\n")
        if len(columns) > 5:
            parts.append(f" ... and {len(columns) - 5} more columns\n")
        parts.append("\n")

    if total_tables > max_tables:
        parts.append(f"\n... and {total_tables - max_tables} more tables.\n")
        parts.append("Ask about a specific table to see its details.\n\n")

    parts.append(
        "You can ask things like:\n"
        "• How many patients are there?\n"
        "• Patient count by gender\n"
        "• Admissions by year\n\n"
        "Just tell me what you want to explore "
    )

    return "".join(parts)
259
 
260
- # =========================
261
- # TIME HANDLING
262
- # =========================
263
 
264
- # =========================
265
- # SQL GENERATION
266
- # =========================
267
 
268
def build_prompt(question):
    """Assemble the LLM prompt for turning ``question`` into SQLite SQL.

    The prompt is: fixed rules, the schema of the tables chosen by
    ``extract_relevant_tables`` (columns mentioned in the question plus key
    and join columns), join hints, and finally the question itself.

    The prompt is capped at about 6000 characters. Only the schema section is
    shortened to meet the cap; the join hints, the question and the closing
    instruction are always kept. The previous implementation truncated the
    tail of the prompt, which is where the question sits.

    Args:
        question: The user's question.

    Returns:
        The prompt text.

    Raises:
        ValueError: No table in the schema appears relevant to the question.
    """
    max_prompt_chars = 6000
    truncation_marker = "\n\n# Schema truncated for safety\n"

    matched = extract_relevant_tables(question)
    full_schema = load_ai_schema()

    if not matched:
        available_tables = list(full_schema.keys())[:10]
        tables_list = "\n".join(f"- {t}" for t in available_tables)
        if len(full_schema) > 10:
            tables_list += f"\n... and {len(full_schema) - 10} more tables"
        raise ValueError(
            "I couldn't find any relevant tables for your question.\n\n"
            f"Available tables:\n{tables_list}\n\n"
            "Try mentioning a table name or ask: 'what data is available?'"
        )

    # Key / join columns that are always shown, whatever the question says.
    IMPORTANT_COLS = {
        "subject_id", "hadm_id", "stay_id",
        "icustay_id", "itemid",
        "charttime", "starttime", "endtime",
    }

    header = """
You are an expert SQLite query generator.

STRICT RULES:
- Use ONLY the tables and columns listed below
- NEVER invent table or column names
- If the answer cannot be derived, return: NOT_ANSWERABLE
- Do NOT explain the SQL
- Do NOT wrap SQL in markdown
- Use explicit JOIN conditions
- Prefer COUNT(*) for totals
- Use these joins only if columns from both tables are required.
- patients.subject_id = admissions.subject_id
- admissions.hadm_id = icustays.hadm_id
- icustays.stay_id = chartevents.stay_id

Schema:
"""

    question_words = question.lower().split()
    schema_parts = []
    for table in matched:
        schema_parts.append(f"\nTable: {table}\n")
        for col, desc in full_schema[table]["columns"].items():
            text = f"{col} {col_desc(desc)}".lower()
            relevant = any(w in text for w in question_words)
            is_key = col in IMPORTANT_COLS or col.endswith("_id")
            if relevant or is_key:
                schema_parts.append(f"- {col}\n")
    schema_text = "".join(schema_parts)

    footer = (
        """
Join hints:
- patients.subject_id ↔ admissions.subject_id
- admissions.hadm_id ↔ icustays.hadm_id
- icustays.stay_id ↔ chartevents.stay_id
"""
        + f"\nQuestion: {question}\n"
        + "\nUse EXACT table and column names as shown above."
    )

    # Safety cap: shorten the schema listing only.
    budget = max_prompt_chars - len(header) - len(footer)
    if len(schema_text) > budget:
        keep = max(0, budget - len(truncation_marker))
        schema_text = schema_text[:keep] + truncation_marker

    return header + schema_text + footer
342
-
343
def call_llm(prompt, model="gpt-4.1-mini"):
    """Send ``prompt`` to the OpenAI chat API and return the reply text.

    Args:
        prompt: The user message to send.
        model: Chat model name; defaults to the model used previously.

    Returns:
        The reply content with surrounding whitespace removed.

    Raises:
        ValueError: The API call failed (original exception attached as
            ``__cause__``) or the reply was empty.
    """
    # Guard only the network call so the empty-reply error below is raised
    # directly and not re-wrapped.
    try:
        res = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "Return only SQL or NOT_ANSWERABLE"},
                {"role": "user", "content": prompt},
            ],
            temperature=0,
        )
    except Exception as e:
        raise ValueError(f"OpenAI API error: {str(e)}") from e

    if not res.choices or not res.choices[0].message.content:
        # Message kept identical to what callers previously received.
        raise ValueError("OpenAI API error: Empty response from OpenAI API")
    return res.choices[0].message.content.strip()
359
 
360
  # =========================
361
- # SQL SAFETY
362
  # =========================
363
 
364
def _first_statement(sql):
    """Return ``sql`` up to the first semicolon that is outside a quoted string."""
    quote = None
    for index, ch in enumerate(sql):
        if quote is not None:
            # A doubled quote ('') closes and immediately reopens the
            # literal, which this toggle handles correctly.
            if ch == quote:
                quote = None
        elif ch in ("'", '"'):
            quote = ch
        elif ch == ";":
            return sql[:index]
    return sql


def sanitize_sql(sql):
    """Normalise raw LLM output into a single one-line SQL statement.

    Removes Markdown code-fence markers and a leading ``sql`` language tag,
    keeps only the first statement, and replaces newlines with spaces.

    The statement boundary is the first semicolon outside a quoted string, so
    a literal such as ``'a;b'`` no longer truncates the query.

    Args:
        sql: Text returned by the model.

    Returns:
        The cleaned statement without a trailing semicolon.
    """
    sql = sql.replace("```sql", "").replace("```", "").strip()
    # A fence written as "``` sql" leaves a bare language tag behind.
    if sql.startswith("sql"):
        sql = sql[3:].strip()
    sql = _first_statement(sql)
    return sql.replace("\n", " ").strip()
372
-
373
def correct_table_names(sql):
    """Rewrite known wrong table names that follow FROM or JOIN.

    A name is replaced only when it is not a schema table (compared
    case-insensitively), it has an entry in the alias map below, and the
    alias target is a schema table. Everything else is left untouched.
    """
    known = {name.lower() for name in load_ai_schema().keys()}

    aliases = {
        "visit": "admissions",
        "visits": "admissions",
        "provider": "caregiver",
        "providers": "caregiver",
    }

    def _fix(found):
        keyword, name = found.group(1), found.group(2)
        lowered = name.lower()
        target = aliases.get(lowered)
        if lowered not in known and target in known:
            return f"{keyword} {target}"
        return found.group(0)

    return re.sub(
        r"\b(from|join)\s+([a-zA-Z_][a-zA-Z0-9_]*)",
        _fix,
        sql,
        flags=re.IGNORECASE,
    )
 
 
 
405
 
 
 
 
 
406
 
 
 
 
 
407
 
408
def validate_sql(sql):
    """Check that ``sql`` is a single safe SELECT and make sure it has a LIMIT.

    Keywords are matched as whole words. Identifiers that merely contain a
    keyword (``last_updated``, ``deleted_at``, ``time_limit``) no longer cause
    a false rejection or suppress the LIMIT. A trailing semicolon is removed
    before ``LIMIT 100`` is appended, so the result stays one valid statement.

    Args:
        sql: A sanitised SQL statement.

    Returns:
        The statement, with `` LIMIT 100`` appended when it had no LIMIT.

    Raises:
        ValueError: Not a SELECT; contains a data-modifying keyword; contains
            several statements; has a JOIN without ON; or uses ``SELECT *``.
    """
    sql = sql.strip().rstrip(";").rstrip()
    sql_l = sql.lower()

    # Must be SELECT
    if not sql_l.startswith("select"):
        raise ValueError("Only SELECT statements are allowed")

    # Block dangerous keywords (whole words only)
    if re.search(r"\b(?:insert|update|delete|drop|alter|truncate)\b", sql_l):
        raise ValueError("Unsafe SQL detected")

    # Block multiple statements (the trailing semicolon is already gone)
    if ";" in sql_l:
        raise ValueError("Multiple SQL statements are not allowed")

    # JOIN must have ON
    if re.search(r"\bjoin\b", sql_l) and not re.search(r"\bon\b", sql_l):
        raise ValueError("JOIN without ON condition is not allowed")

    # Prevent SELECT *
    if re.search(r"\bselect\s+\*", sql_l):
        raise ValueError("SELECT * is not allowed")

    # Enforce LIMIT
    if not re.search(r"\blimit\b", sql_l):
        sql += " LIMIT 100"

    return sql
 
437
 
438
def explain_sql(sql):
    """Return a coarse description of ``sql``.

    All three checks are case-insensitive. The aggregation check used to be
    case-sensitive, so the usual ``COUNT(*)`` spelling was reported as a plain
    selection.

    Returns:
        A dict with ``type`` ("aggregation" when the query calls ``count(``,
        else "selection"), ``has_join`` and ``has_filter``.
    """
    lowered = sql.lower()
    return {
        "type": "aggregation" if "count(" in lowered else "selection",
        "has_join": "join" in lowered,
        "has_filter": "where" in lowered,
    }
444
 
445
- # =========================
446
- # PATIENT SUMMARY
447
- # =========================
448
 
449
def validate_identifier(name):
    """Return True when ``name`` is a safe SQL identifier.

    A safe identifier is a non-empty ``str`` made only of ASCII letters,
    digits and underscores, and it does not start with a digit. That rule
    excludes quotes, semicolons, comment markers, brackets and whitespace.
    """
    if not isinstance(name, str) or not name:
        return False
    # fullmatch (unlike match with "$") also rejects a trailing newline.
    return re.fullmatch(r"[a-zA-Z_][a-zA-Z0-9_]*", name) is not None
459
 
460
  # =========================
461
- # MAIN ENGINE
462
  # =========================
463
 
464
def process_question(question):
    """Turn a natural-language question into validated SQL (never executed).

    Steps: spelling correction, a shortcut for "what data / what tables"
    requests, prompt construction, the LLM call, then SQL sanitising,
    table-name correction and validation.

    The NOT_ANSWERABLE sentinel is checked after sanitising. A sentinel
    wrapped in a code fence or followed by punctuation is therefore reported
    as "not enough data", not as an "Only SELECT" validation error.

    Args:
        question: The user's question.

    Returns:
        A dict with ``status`` ("ok" or "error") and ``message``. A successful
        SQL generation also carries ``sql`` and ``sql_info``.
    """
    question = correct_spelling(question)

    # 1. Metadata requests
    if any(x in question.lower() for x in ["what data", "what tables"]):
        return {
            "status": "ok",
            "message": describe_schema(),
        }

    # 2-4. Build the prompt, generate SQL, then sanitise and validate it.
    # Any failure along the way is reported the same way.
    try:
        prompt = build_prompt(question)
        sql = sanitize_sql(call_llm(prompt))

        if sql.upper().startswith("NOT_ANSWERABLE"):
            return {
                "status": "ok",
                "message": "I don't have enough data to answer that.",
            }

        sql = correct_table_names(sql)
        sql = validate_sql(sql)
        sql_info = explain_sql(sql)
    except Exception as e:
        return {
            "status": "error",
            "message": str(e),
        }

    # 5. Return SQL only (no execution)
    return {
        "status": "ok",
        "message": humanize(
            "Here’s the SQL query I generated based on your question 😊"
        ),
        "sql": sql,
        "sql_info": sql_info,
    }
 
1
+ import json
2
  import os
3
+ from functools import lru_cache
4
  from openai import OpenAI
 
5
  from datetime import datetime
6
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  # =========================
8
+ # CONFIG
9
  # =========================
10
 
11
# Module-wide OpenAI client; the key is read from the environment at import time.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
 
 
 
 
 
12
 
13
  # =========================
14
+ # METADATA LOADING
15
  # =========================
16
 
17
@lru_cache(maxsize=1)
def load_metadata():
    """Load and cache the four metadata JSON files.

    Files are read from the current working directory as UTF-8, in this
    order: ``modules.json``, ``join_graph.json``, ``field_types.json``,
    ``fields.json``. The explicit encoding makes the result independent of
    the platform's default. The result is cached for the process lifetime.

    Returns:
        A dict with the keys ``modules``, ``joins``, ``field_types`` and
        ``fields`` holding the parsed content of the corresponding file.
    """
    sources = {
        "modules": "modules.json",
        "joins": "join_graph.json",
        "field_types": "field_types.json",
        "fields": "fields.json",
    }

    metadata = {}
    for key, filename in sources.items():
        with open(filename, encoding="utf-8") as f:
            metadata[key] = json.load(f)
    return metadata
37
 
 
 
38
 
39
def _escape_sql_text(value):
    """Return ``value`` as text with single quotes doubled for a SQL literal."""
    return str(value).replace("'", "''")


def _sql_literal(value):
    """Render one Python value as a SQL literal (used for IN lists)."""
    if value is None:
        return "NULL"
    if isinstance(value, bool):  # test before int: bool is an int subclass
        return "1" if value else "0"
    if isinstance(value, (int, float)):
        return repr(value)
    return f"'{_escape_sql_text(value)}'"


def resolve_operator(op, value):
    """Translate a plan operator and value into a SQL operator and literal.

    The value comes from the LLM's reading of user text. Single quotes are
    therefore escaped by doubling, so a value such as ``O'Brien`` cannot
    terminate the literal and inject SQL. IN lists are rendered as SQL
    literals, not Python ``repr`` output.

    NOTE(review): ``%`` and ``_`` inside LIKE values still act as wildcards.
    Escaping them would need an ``ESCAPE`` clause, which this function's
    return shape cannot carry.

    Args:
        op: One of ``equals``, ``not_equals``, ``greater_than``,
            ``less_than``, ``greater_or_equal``, ``less_or_equal``,
            ``contains``, ``starts_with``, ``ends_with``, ``in``, ``not_in``.
        value: The comparison value; must be a list for ``in`` / ``not_in``.

    Returns:
        A ``(sql_operator, sql_literal)`` tuple of strings.

    Raises:
        ValueError: Unknown operator, or a non-list value for an IN operator.
    """
    mapping = {
        "equals": "=",
        "not_equals": "!=",
        "greater_than": ">",
        "less_than": "<",
        "greater_or_equal": ">=",
        "less_or_equal": "<=",
        "contains": "LIKE",
        "starts_with": "LIKE",
        "ends_with": "LIKE",
        "in": "IN",
        "not_in": "NOT IN",
    }

    if op not in mapping:
        raise ValueError(f"Unsupported operator: {op}")

    sql_op = mapping[op]

    if op in ("in", "not_in"):
        if not isinstance(value, list):
            raise ValueError("IN operator requires list")
        return sql_op, f"({','.join(map(_sql_literal, value))})"

    text = _escape_sql_text(value)
    if op == "contains":
        return sql_op, f"'%{text}%'"
    if op == "starts_with":
        return sql_op, f"'{text}%'"
    if op == "ends_with":
        return sql_op, f"'%{text}'"

    return sql_op, f"'{text}'"
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
  # =========================
74
+ # JOIN RESOLUTION
75
  # =========================
76
 
77
def resolve_join_path(start_table, end_table):
    """Return the join steps that lead from ``start_table`` to ``end_table``.

    Scans the ``joins`` metadata for the first entry whose ``start_table``
    and ``end_table`` both match, and returns that entry's ``steps``.

    Raises:
        ValueError: No entry connects the two tables.
    """
    graph = load_metadata()["joins"]

    not_found = object()  # distinguishes "no entry" from any stored value
    steps = next(
        (
            edge["steps"]
            for edge in graph.values()
            if edge["start_table"] == start_table and edge["end_table"] == end_table
        ),
        not_found,
    )
    if steps is not_found:
        raise ValueError(
            f"No join path found from {start_table} to {end_table}"
        )
    return steps
87
 
 
88
 
89
def resolve_field(field_name, module):
    """Look up ``field_name`` in the ``fields`` metadata and check it.

    Returns:
        The field's metadata entry.

    Raises:
        ValueError: The field is unknown, belongs to a different module, or
            has no ``table`` / ``column`` mapping.
    """
    catalog = load_metadata()["fields"]

    if field_name not in catalog:
        raise ValueError(f"Unknown field: {field_name}")

    entry = catalog[field_name]

    if entry["module"] != module:
        raise ValueError(
            f"Field '{field_name}' does not belong to module '{module}'"
        )

    has_mapping = "table" in entry and "column" in entry
    if not has_mapping:
        raise ValueError(
            f"Field '{field_name}' is missing table/column mapping"
        )

    return entry
109
 
 
 
 
 
 
110
 
111
def build_join_sql(base_table, steps):
    """Render a chain of join steps as JOIN clauses, one per line.

    The first step joins from ``base_table``, which is referenced by its own
    name. Every later step joins from the alias introduced by the step
    before it.

    Args:
        base_table: Name of the table the chain starts from.
        steps: Sequence of dicts with the keys ``alias``, ``join_type``,
            ``table``, ``base_column`` and ``foreign_column``.

    Returns:
        The JOIN clauses separated by newlines; empty string for no steps.
    """
    clauses = []
    left = base_table
    for hop in steps:
        right = hop["alias"]
        kind = hop["join_type"].upper()
        condition = f"{left}.{hop['base_column']} = {right}.{hop['foreign_column']}"
        clauses.append(f"{kind} JOIN {hop['table']} {right} ON {condition}")
        left = right
    return "\n".join(clauses)
124
 
 
 
125
 
126
+ # =========================
127
+ # INTENT PARSING (LLM)
128
+ # =========================
129
+
130
def parse_intent(question):
    """Ask the LLM to turn ``question`` into a structured query plan.

    The model is asked for JSON only, but replies may still be wrapped in a
    Markdown code fence or be empty. The fence is removed before parsing, and
    every malformed reply is reported as a ``ValueError`` with a clear
    message.

    Args:
        question: The user's question.

    Returns:
        The plan as a dict (expected keys: ``module``, ``filters`` and
        optionally the selected fields).

    Raises:
        ValueError: The reply is empty, is not valid JSON, or is not a JSON
            object.
    """
    prompt = f"""
You are a query planner.

Extract:
- module
- filters (field, operator, value)
- selected fields (list of fields)

Return JSON only.

Example:
{{
"module": "employees",
"filters": [
{{ "field": "department", "operator": "equals", "value": "IT" }}
]
}}

User question:
{question}
"""

    res = client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )

    content = res.choices[0].message.content if res.choices else None
    if not content:
        raise ValueError("Empty response from intent parser")

    text = content.strip()
    if text.startswith("```"):
        # Drop the fence and an optional "json" language tag.
        text = text.strip("`").strip()
        if text.lower().startswith("json"):
            text = text[4:]

    try:
        plan = json.loads(text)
    except json.JSONDecodeError as e:
        raise ValueError(f"Intent parser returned invalid JSON: {e}") from e

    if not isinstance(plan, dict):
        raise ValueError("Intent parser must return a JSON object")
    return plan
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
 
161
  # =========================
162
+ # SQL GENERATOR
163
  # =========================
164
 
165
def build_sql(plan):
    """Build a SELECT statement from a query plan.

    Selects all columns of the module's base table, joins each table that a
    filter needs (once per table), and ANDs the filters together.

    Fixes over the previous version:
      * A plan without filters (missing, empty or ``null``) produces no
        WHERE clause; it used to emit a dangling ``WHERE``.
      * A filter on a joined table references the alias the join introduced,
        not the raw table name.

    NOTE(review): joins are de-duplicated per target table only. Two paths
    sharing an intermediate hop would still emit that hop twice.

    Args:
        plan: Dict with ``module`` and optionally ``filters``, a list of
            dicts with ``field``, ``operator`` and ``value``.

    Returns:
        The SQL text, limited to 100 rows.

    Raises:
        ValueError: Unknown module, field or operator, or no join path.
    """
    meta = load_metadata()

    module = plan["module"]
    if module not in meta["modules"]:
        raise ValueError(f"Unknown module: {module}")

    base_table = meta["modules"][module]["base_table"]

    joins = []
    # table name -> name to use for it in WHERE (the base table is unaliased)
    aliases = {base_table: base_table}
    where_clauses = []

    for f in plan.get("filters") or []:
        field_name = f["field"]
        operator = f["operator"]
        value = f["value"]

        # Resolve field metadata
        field = resolve_field(field_name, module)
        table = field["table"]
        column = field["column"]

        # Join each table only once
        if table not in aliases:
            join_steps = resolve_join_path(base_table, table)
            joins.append(build_join_sql(base_table, join_steps))

            # Assumes the last hop of the path lands on the target table;
            # fall back to the table name when it does not.
            alias = table
            if join_steps and join_steps[-1].get("table") == table:
                alias = join_steps[-1]["alias"]
            aliases[table] = alias

        sql_op, sql_value = resolve_operator(operator, value)
        where_clauses.append(f"{aliases[table]}.{column} {sql_op} {sql_value}")

    lines = [f"SELECT {base_table}.*", f"FROM {base_table}"]
    lines.extend(j for j in joins if j)
    if where_clauses:
        lines.append(f"WHERE {' AND '.join(where_clauses)}")
    lines.append("LIMIT 100")

    return "\n".join(lines)
 
 
214
 
 
 
 
215
 
216
+ # =========================
217
+ # VALIDATION
218
+ # =========================
219
 
220
def validate_sql(sql):
    """Check that ``sql`` is a read-only SELECT and return it unchanged.

    The checks run on a lower-cased copy. The statement itself is returned
    as given, so string literals keep their case; the previous version
    returned the lower-cased text and turned ``'IT'`` into ``'it'``.
    Forbidden keywords are matched as whole words, so identifiers such as
    ``last_updated`` are accepted.

    Args:
        sql: The SQL statement to check.

    Returns:
        ``sql`` exactly as passed in.

    Raises:
        ValueError: Not a SELECT, or contains a data-modifying keyword.
    """
    import re  # local import: this module does not import re at file level

    normalized = sql.strip().lower()

    if not normalized.startswith("select"):
        raise ValueError("Only SELECT allowed")

    if re.search(r"\b(?:drop|delete|update|insert|truncate)\b", normalized):
        raise ValueError("Unsafe SQL")

    return sql
 
 
 
 
 
 
 
 
 
231
 
232
  # =========================
233
+ # MAIN ENTRY POINT
234
  # =========================
235
 
236
def run(question):
    """Answer ``question`` with a query plan and the SQL built from it.

    Pipeline: ``parse_intent`` -> ``build_sql`` -> ``validate_sql``.
    Nothing is executed against a database here.

    Returns:
        A dict with ``query_plan`` (the parsed plan) and ``sql`` (the
        validated statement).
    """
    plan = parse_intent(question)
    checked_sql = validate_sql(build_sql(plan))
    return {"query_plan": plan, "sql": checked_sql}