Spaces:

bhavika24
/

Text_to_sql

Sleeping

App Files Files Community

bhavika24 committed on Jan 23

Commit

31318b4

verified ·

1 Parent(s): 484dcc5

Upload engine.py

Browse files

Files changed (1) hide show

engine.py +106 -294

engine.py CHANGED Viewed

@@ -17,8 +17,6 @@ def log_interaction(user_q, sql=None, result=None, error=None):
         "error": error
     })
 # =========================
 # SETUP
 # =========================
@@ -38,8 +36,6 @@ conn = sqlite3.connect("mimic_iv.db", check_same_thread=False)
 LAST_PROMPT_TYPE = None
 LAST_SUGGESTED_DATE = None
 # =========================
 # HUMAN RESPONSE HELPERS
 # =========================
@@ -87,8 +83,6 @@ def correct_spelling(q):
         fixed.append(match[0] if match else clean)
     return " ".join(fixed)
 # =========================
 # SCHEMA
 # =========================
@@ -174,8 +168,6 @@ def extract_relevant_tables(question, max_tables=4):
     "vital": ["chartevents"],
     "vitals": ["chartevents"],
 }
     # Only include hints for tables that exist in the schema
     for intent, possible_tables in hint_mappings.items():
         matching_tables = [t for t in possible_tables if t in table_names_lower]
@@ -238,7 +230,6 @@ def extract_relevant_tables(question, max_tables=4):
     return [t[0] for t in matched[:max_tables]]
 # =========================
 # HUMAN SCHEMA DESCRIPTION
 # =========================
@@ -308,8 +299,6 @@ def get_latest_data_date():
     return None
 def normalize_time_question(q):#total-actual date
     latest = get_latest_data_date()
     if not latest:
@@ -323,50 +312,6 @@ def normalize_time_question(q):#total-actual date
     return q
-# =========================
-# UNSUPPORTED QUESTIONS
-# =========================
-def is_question_supported(question):
-    q = question.lower()
-    # 1️⃣ Always allow analytical / time-based queries
-    analytic_keywords = [
-        "count", "total", "average", "avg", "sum",
-        "how many", "number of",
-        "trend", "increase", "decrease", "compare",
-        "last", "latest", "recent", "past",
-        "day", "days", "month", "year"
-    ]
-    if any(k in q for k in analytic_keywords):
-        return True
-    # 2️⃣ Schema-based relevance check
-    schema = load_ai_schema()
-    for table, meta in schema.items():
-        table_l = table.lower()
-        # Table name mentioned
-        if table_l in q:
-            return True
-        # Column or description match
-        for col, desc in meta["columns"].items():
-            if col.lower() in q:
-                return True
-            if isinstance(desc, str) and any(
-                word in desc.lower() for word in q.split()
-            ):
-                return True
-    return False
 # =========================
 # SQL GENERATION
 # =========================
@@ -406,8 +351,7 @@ STRICT RULES:
 - Do NOT wrap SQL in markdown
 - Use explicit JOIN conditions
 - Prefer COUNT(*) for totals
-Always use these joins:
 - patients.subject_id = admissions.subject_id
 - admissions.hadm_id = icustays.hadm_id
 - icustays.stay_id = chartevents.stay_id
@@ -447,8 +391,6 @@ Join hints:
     return prompt
 def call_llm(prompt):
     """Call OpenAI API with error handling."""
     try:
@@ -515,65 +457,85 @@ def correct_table_names(sql):
 def validate_sql(sql):
-    if " join " in sql.lower() and " on " not in sql.lower():
-        raise ValueError("JOIN without ON condition is not allowed")
-    if ";" in sql.strip()[:-1]:
-        raise ValueError("Multiple SQL statements are not allowed")
-    FORBIDDEN = ["insert", "update", "delete", "drop", "alter"]
-    if any(k in sql.lower() for k in FORBIDDEN):
         raise ValueError("Unsafe SQL detected")
-    if not sql.lower().startswith("select"):
-        raise ValueError("Only SELECT allowed")
     return sql
 def run_query(sql):
-    """Execute SQL query with proper error handling."""
     cur = conn.cursor()
     try:
-        rows = cur.execute(sql).fetchall()
-        if cur.description:
-            cols = [c[0] for c in cur.description]
-        else:
-            cols = []
         return cols, rows
     except sqlite3.Error as e:
         raise ValueError(f"Database query error: {str(e)}")
-# =========================
-# AGGREGATE SAFETY
-# =========================
-def is_aggregate_only_query(sql):
-    s = sql.lower()
-    return (
-    any(fn in s for fn in ["count(", "sum(", "avg("])
-    and "group by" not in s
-    and "over(" not in s
-)
-def has_underlying_data(sql):
-    """Check if underlying data exists for the SQL query."""
-    base = sql.lower()
-    if "from" not in base:
-        return False
-    base = base.split("from", 1)[1]
-    # Split at GROUP BY, ORDER BY, LIMIT, etc. to get just the FROM clause
-    for clause in ["group by", "order by", "limit", "having"]:
-        base = base.split(clause)[0]
-    test_sql = "SELECT 1 FROM " + base.strip() + " LIMIT 1"
-    cur = conn.cursor()
-    try:
-        return cur.execute(test_sql).fetchone() is not None
-    except sqlite3.Error:
-        return False
 # =========================
 # PATIENT SUMMARY
 # =========================
@@ -654,235 +616,85 @@ def build_table_summary(table_name):
     return summary
 # =========================
 # MAIN ENGINE
 # =========================
 def process_question(question):
-    global LAST_PROMPT_TYPE, LAST_SUGGESTED_DATE
-    q = question.strip().lower()
-    # ----------------------------------
-    # Normalize first
-    # ----------------------------------
     question = correct_spelling(question)
     question = normalize_time_question(question)
-    LAST_PROMPT_TYPE = None
-    LAST_SUGGESTED_DATE = None
-    # ----------------------------------
-    # Handle "data updated till"
-    # ----------------------------------
-    if any(x in q for x in ["updated", "upto", "up to", "latest data"]):
         return {
             "status": "ok",
-            "message": f"Data is available up to {get_latest_data_date()}",
             "data": []
         }
-    # ----------------------------------
-    # Extract relevant tables
-    # ----------------------------------
-    matched_tables = extract_relevant_tables(question)
-    # ----------------------------------
-    # SUMMARY ONLY IF USER ASKS FOR IT
-    # ----------------------------------
-    if (
-    len(matched_tables) == 1
-    and any(k in q for k in ["summary", "overview", "describe"])
-    and not any(k in q for k in ["count", "total", "how many", "average"])
-):
-        return {
-        "status": "ok",
-        "message": build_table_summary(matched_tables[0]),
-        "data": []
-    }
-    # Only block if too many tables matched AND it's not an analytical question
-    # Analytical questions (how many, count, etc.) often need multiple tables
-    is_analytical = any(k in q for k in [
-        "how many", "count", "total", "number of",
-        "average", "avg", "sum", "more than", "less than",
-        "compare", "trend"
-    ])
-    if len(matched_tables) > 4 and not is_analytical:
-        return {
-        "status": "ok",
-        "message": (
-            "Your question matches too many datasets:\n"
-            + "\n".join(f"- {t}" for t in matched_tables[:5])
-            + "\n\nPlease be more specific about what you want to know."
-        ),
-        "data": []
-    }
-    # ----------------------------------
-    # Metadata discovery
-    # ----------------------------------
-    if any(x in q for x in ["what data", "what tables", "which data"]):
         return {
-            "status": "ok",
-            "message": humanize(describe_schema()),
             "data": []
         }
-    # ----------------------------------
-    # # LAST DATA / RECENT DATA HANDLING
-    # # ----------------------------------
-    if any(x in q for x in ["last data", "latest data"]):
-        return {
-        "status": "ok",
-        "message": f"Latest data available is from {get_latest_data_date()}",
-        "data": []
-    }
-    if "last" in q and "day" in q and ("visit" in q or "admission" in q):
-        sql = """
-    SELECT subject_id, admittime
-    FROM admissions
-    WHERE admittime >= date(
-        (SELECT MAX(admittime) FROM admissions),
-        '-30 days'
-    )
-    ORDER BY admittime DESC
-    """
-    cols, rows = run_query(sql)
-    log_interaction(
-        user_q=question,
-        sql=sql,
-        result=rows
-    )
-    return {
-        "status": "ok",
-        "sql": sql,
-        "columns": cols,
-        "data": rows
-    }
-    # ----------------------------------
-    # Unsupported question check
-    # ----------------------------------
-    if not is_question_supported(question):
-        log_interaction(
-        user_q=question,
-        error="Unsupported question"
-    )
-        return {
-        "status": "ok",
-        "message": (
-            "That information isn’t available in the system.\n\n"
-            "You can ask about:\n"
-            "• Patients\n"
-            "• Admissions / Visits\n"
-            "• ICU stays\n"
-            "• Diagnoses / Conditions\n"
-            "• Vitals & lab measurements"
-        ),
-        "data": []
-    }
-    # ----------------------------------
-    # Generate SQL
-    # ----------------------------------
     try:
-        sql = call_llm(build_prompt(question))
-    except ValueError as e:
-        log_interaction(
-        user_q=question,
-        error=str(e)
-    )
-    return {
-        "status": "ok",
-        "message": str(e),
-        "data": []
-    }
-    if sql == "NOT_ANSWERABLE":
         return {
-            "status": "ok",
-            "message": "I don't have enough data to answer that.",
             "data": []
         }
-    # Sanitize, correct table names, then validate
-    sql = sanitize_sql(sql)
-    sql = correct_table_names(sql)
-    sql = validate_sql(sql)
-    cols, rows = run_query(sql)
-    # ✅ LOG ONCE (THIS FIXES YOUR DOWNLOAD ISSUE)
-    log_interaction(
-        user_q=question,
-        sql=sql,
-        result=rows
-    )
-    if not rows:
         return {
             "status": "ok",
-            "message": friendly("No records found."),
             "data": []
         }
-    return {
-        "status": "ok",
-        "sql": sql,
-        "columns": cols,
-        "data": rows
-    }
-    # ----------------------------------
-    # No data handling
-    # ----------------------------------
-    if is_aggregate_only_query(sql) and not has_underlying_data(sql):
-        LAST_PROMPT_TYPE = "NO_DATA"
-        LAST_SUGGESTED_DATE = get_latest_data_date()
         return {
-            "status": "ok",
-            "message": friendly("No data is available for that time period."),
-            "note": f"Available data is only up to {LAST_SUGGESTED_DATE}.",
             "data": []
         }
-    if not rows:
-        log_interaction(
-    user_q=question,
-    sql=sql,
-    result=[]
-)
-        LAST_PROMPT_TYPE = "NO_DATA"
-        LAST_SUGGESTED_DATE = get_latest_data_date()
         return {
-            "status": "ok",
-            "message": friendly("No records found."),
-            "note": f"Available data is only up to {LAST_SUGGESTED_DATE}.",
             "data": []
         }
-    # ----------------------------------
-    # Success
-    # ----------------------------------
     return {
         "status": "ok",
         "sql": sql,
         "columns": cols,
         "data": rows
     }

         "error": error
     })
 # =========================
 # SETUP
 # =========================
 LAST_PROMPT_TYPE = None
 LAST_SUGGESTED_DATE = None
 # =========================
 # HUMAN RESPONSE HELPERS
 # =========================
         fixed.append(match[0] if match else clean)
     return " ".join(fixed)
 # =========================
 # SCHEMA
 # =========================
     "vital": ["chartevents"],
     "vitals": ["chartevents"],
 }
     # Only include hints for tables that exist in the schema
     for intent, possible_tables in hint_mappings.items():
         matching_tables = [t for t in possible_tables if t in table_names_lower]
     return [t[0] for t in matched[:max_tables]]
 # =========================
 # HUMAN SCHEMA DESCRIPTION
 # =========================
     return None
 def normalize_time_question(q):#total-actual date
     latest = get_latest_data_date()
     if not latest:
     return q
 # =========================
 # SQL GENERATION
 # =========================
 - Do NOT wrap SQL in markdown
 - Use explicit JOIN conditions
 - Prefer COUNT(*) for totals
+- Use these joins only if columns from both tables are required.
 - patients.subject_id = admissions.subject_id
 - admissions.hadm_id = icustays.hadm_id
 - icustays.stay_id = chartevents.stay_id
     return prompt
 def call_llm(prompt):
     """Call OpenAI API with error handling."""
     try:
 def validate_sql(sql):
     """Check that *sql* is a single read-only SELECT and return it with a LIMIT.
     The statement is rejected with ``ValueError`` when it does not start with
     SELECT, contains a write/DDL keyword, contains more than one statement,
     has a JOIN without any ON, or uses ``SELECT *``. When no LIMIT keyword is
     present, `` LIMIT 100`` is appended (after dropping a trailing ``;``).
     Keywords are matched as whole words, so identifiers such as
     ``last_updated`` or ``time_limit`` are not mistaken for keywords.
     Returns the (possibly LIMIT-extended) SQL string.
     """
     import re  # local import: the file's top-level import block is not visible here
     if not isinstance(sql, str):
         raise ValueError("Only SELECT statements are allowed")
     sql_l = sql.lower().strip()
     # Must be SELECT
     if not sql_l.startswith("select"):
         raise ValueError("Only SELECT statements are allowed")
     # Block dangerous keywords (whole-word match; "_" counts as a word character,
     # so column names like "last_updated" or "deleted_flag" stay legal)
     forbidden = ("insert", "update", "delete", "drop", "alter", "truncate")
     if re.search(r"\b(?:" + "|".join(forbidden) + r")\b", sql_l):
         raise ValueError("Unsafe SQL detected")
     # Block multiple statements; a single trailing ";" is tolerated
     if ";" in sql_l[:-1]:
         raise ValueError("Multiple SQL statements are not allowed")
     # JOIN must have ON (whitespace-agnostic, so newline-formatted SQL is checked too)
     if re.search(r"\bjoin\b", sql_l) and not re.search(r"\bon\b", sql_l):
         raise ValueError("JOIN without ON condition is not allowed")
     # Prevent SELECT *
     if re.search(r"\bselect\s+\*", sql_l):
         raise ValueError("SELECT * is not allowed")
     # Enforce LIMIT; strip a trailing ";" first so the result stays valid SQL
     if not re.search(r"\blimit\b", sql_l):
         sql = sql.rstrip().rstrip(";").rstrip() + " LIMIT 100"
     return sql
def explain_sql(sql):
    """Return a small, heuristic description of *sql* for display alongside results.
    Keys: ``type`` is "aggregation" when the statement calls COUNT(...) and
    "selection" otherwise; ``has_join`` / ``has_filter`` report whether the
    JOIN / WHERE keywords occur. Matching is case-insensitive and on whole
    words, so ``COUNT(*)`` is recognised and ``joined_at`` is not a join.
    """
    import re  # local import: the file's top-level import block is not visible here
    lowered = sql.lower()
    return {
        "type": "aggregation" if re.search(r"\bcount\s*\(", lowered) else "selection",
        "has_join": re.search(r"\bjoin\b", lowered) is not None,
        "has_filter": re.search(r"\bwhere\b", lowered) is not None
    }
 def run_query(sql):
     """Execute *sql* on the module-level connection and return ``(cols, rows)``.
     Steps: inspect ``EXPLAIN QUERY PLAN`` and reject plans that scan a table
     without an index; run the query; reject a lone COUNT result above ten
     million as likely join duplication; cap the result at 1000 rows.
     Raises ``ValueError`` for rejected queries and for any ``sqlite3.Error``
     (the original database error is chained as the cause).
     """
     MAX_ROWS = 1000
     cur = conn.cursor()
     try:
         # 1) Validate query plan
         cur.execute("EXPLAIN QUERY PLAN " + sql)
         for row in cur.fetchall():
             detail = row[-1].lower()
             # SQLite reports index-backed scans as "USING INDEX" or
             # "USING COVERING INDEX"; both must count as indexed.
             indexed = "using index" in detail or "using covering index" in detail
             if "scan" in detail and not indexed:
                 raise ValueError("Query rejected: full table scan detected")
         # 2) Execute query; fetch at most MAX_ROWS instead of loading everything
         cur.execute(sql)
         rows = cur.fetchmany(MAX_ROWS)
         # 3) Guard against inflated COUNT results
         sql_l = sql.lower()
         if "count(" in sql_l and "group by" not in sql_l:
             if len(rows) == 1 and isinstance(rows[0][0], (int, float)):
                 if rows[0][0] > 10_000_000:
                     raise ValueError(
                         "Suspiciously large count — possible join duplication"
                     )
         # 4) Extract columns
         cols = [c[0] for c in cur.description] if cur.description else []
         return cols, rows
     except sqlite3.Error as e:
         raise ValueError(f"Database query error: {str(e)}") from e
     finally:
         cur.close()
 # =========================
 # PATIENT SUMMARY
 # =========================
     return summary
 # =========================
 # MAIN ENGINE
 # =========================
 def _error_response(question, exc, sql=None):
     """Log a failed interaction and build the error payload returned to the caller."""
     log_interaction(user_q=question, sql=sql, error=str(exc))
     return {
         "status": "error",
         "message": str(exc),
         "data": []
     }
 def process_question(question):
     """Answer a natural-language question by generating and running SQL.
     The question is spell-corrected and time-normalized, then either answered
     from the schema description (metadata requests) or turned into SQL via the
     LLM, sanitized, validated and executed.
     Returns a dict. On success: ``status`` "ok" with ``sql``, ``sql_info``,
     ``columns`` and ``data``. For metadata requests or a "NOT_ANSWERABLE"
     reply: ``status`` "ok" with ``message`` and empty ``data``. On any
     failure: ``status`` "error" with the exception text in ``message``; the
     failure is also recorded through ``log_interaction``.
     """
     question = correct_spelling(question)
     question = normalize_time_question(question)
     # 1) Metadata requests
     if any(x in question.lower() for x in ["what data", "what tables"]):
         return {
             "status": "ok",
             "message": describe_schema(),
             "data": []
         }
     # 2) Build LLM prompt
     try:
         prompt = build_prompt(question)
     except Exception as e:
         return _error_response(question, e)
     # 3) Generate SQL
     try:
         sql = call_llm(prompt)
     except Exception as e:
         return _error_response(question, e)
     if sql == "NOT_ANSWERABLE":
         return {
             "status": "ok",
             "message": "I don't have enough data to answer that.",
             "data": []
         }
     # 4) Sanitize & validate
     try:
         sql = sanitize_sql(sql)
         sql = correct_table_names(sql)
         sql = validate_sql(sql)
         sql_info = explain_sql(sql)
     except Exception as e:
         # Keep the offending SQL in the log so rejected statements can be reviewed
         return _error_response(question, e, sql=sql)
     # 5) Execute
     try:
         cols, rows = run_query(sql)
     except Exception as e:
         return _error_response(question, e, sql=sql)
     # 6) Log (only the first rows, to keep log entries small)
     log_interaction(
         user_q=question,
         sql=sql,
         result=rows[:10]
     )
     # 7) Return
     return {
         "status": "ok",
         "sql": sql,
         "sql_info": sql_info,
         "columns": cols,
         "data": rows
     }