Spaces:

bhavika24
/

Text_to_sql

Sleeping

App Files Files Community

bhavika24 committed on Jan 23

Commit

f3ebab8

verified ·

1 Parent(s): 31318b4

Upload engine.py

Browse files

Files changed (1) hide show

engine.py +13 -195

engine.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import os
 import re
-import sqlite3
 from openai import OpenAI
 from difflib import get_close_matches
 from datetime import datetime
@@ -26,7 +25,6 @@ api_key = os.getenv("OPENAI_API_KEY")
 if not api_key:
     raise ValueError("OPENAI_API_KEY environment variable is not set")
 client = OpenAI(api_key=api_key)
-conn = sqlite3.connect("mimic_iv.db", check_same_thread=False)
 # =========================
@@ -44,14 +42,6 @@ def humanize(text):
     return f"Sure \n\n{text}"
 def friendly(text):
-    global LAST_SUGGESTED_DATE
-    if LAST_SUGGESTED_DATE:
-        return f"{text}\n\nLast data available is {LAST_SUGGESTED_DATE}"
-    else:
-        # If date not set yet, try to get it
-        date = get_latest_data_date()
-        if date:
-            return f"{text}\n\nLast data available is {date}"
         return text
 def is_confirmation(text):
@@ -271,47 +261,6 @@ def describe_schema(max_tables=10):#what data you have or which table exist
 # TIME HANDLING
 # =========================
-def get_latest_data_date():
-    """
-    Returns the most meaningful 'latest date' for the system.
-    Priority:
-    1. admissions.admittime
-    2. icustays.intime
-    3. chartevents.charttime
-    """
-    checks = [
-        ("admissions", "admittime"),
-        ("icustays", "intime"),
-        ("chartevents", "charttime"),
-    ]
-    for table, column in checks:
-        try:
-            result = conn.execute(
-                f"SELECT MAX({column}) FROM {table}"
-            ).fetchone()
-            if result and result[0]:
-                return result[0]
-        except Exception:
-            continue
-    return None
-def normalize_time_question(q):#total-actual date
-    latest = get_latest_data_date()
-    if not latest:
-        return q
-    if "today" in q:
-        return q.replace("today", f"on {latest[:10]}")
-    if "yesterday" in q:
-        return q.replace("yesterday", f"on {latest[:10]}")
-    return q
 # =========================
 # SQL GENERATION
 # =========================
@@ -493,49 +442,6 @@ def explain_sql(sql):
         "has_filter": "where" in sql.lower()
     }
-def run_query(sql):
-    """Execute SQL query safely with validation and limits."""
-    cur = conn.cursor()
-    try:
-        # 1️⃣ Validate query plan
-        cur.execute("EXPLAIN QUERY PLAN " + sql)
-        plan = cur.fetchall()
-        for row in plan:
-            detail = row[-1].lower()
-            if "scan" in detail and "using index" not in detail:
-                raise ValueError("Query rejected: full table scan detected")
-        # 2️⃣ Execute query
-        cur.execute(sql)
-        rows = cur.fetchall()
-        # ✅ 3️⃣ Guard against inflated COUNT results
-        if "count(" in sql.lower() and "group by" not in sql.lower():
-            if len(rows) == 1 and isinstance(rows[0][0], (int, float)):
-                if rows[0][0] > 10_000_000:
-                    raise ValueError(
-                        "Suspiciously large count — possible join duplication"
-                    )
-        # 4️⃣ Limit result size
-        MAX_ROWS = 1000
-        if len(rows) > MAX_ROWS:
-            rows = rows[:MAX_ROWS]
-        # 5️⃣ Extract columns
-        cols = [c[0] for c in cur.description] if cur.description else []
-        return cols, rows
-    except sqlite3.Error as e:
-        raise ValueError(f"Database query error: {str(e)}")
-    finally:
-        cur.close()
 # =========================
 # PATIENT SUMMARY
 # =========================
@@ -551,85 +457,18 @@ def validate_identifier(name):
     # Must start with letter or underscore, rest alphanumeric/underscore
     return bool(re.match(r'^[a-zA-Z_][a-zA-Z0-9_]*$', name))
-def build_table_summary(table_name):
-    """Build summary for a table using metadata."""
-    # Validate table name against metadata first
-    schema = load_ai_schema()
-    if table_name not in schema:
-        return f"Table {table_name} not found in metadata."
-    # Additional safety check
-    if not validate_identifier(table_name):
-        return f"Invalid table name: {table_name}"
-    cur = conn.cursor()
-    # Total rows (still need to query actual data for count)
-    # Note: SQLite doesn't support parameterized table names
-    # Since we validated table_name against metadata, it's safe
-    try:
-        total = cur.execute(
-            f"SELECT COUNT(*) FROM {table_name}"
-        ).fetchone()[0]
-    except sqlite3.Error as e:
-        return f"Error querying table {table_name}: {str(e)}"
-    columns = schema[table_name]["columns"]  # {col_name: description, ...}
-    summary = f"Here's a summary of **{table_name}**:\n\n"
-    summary += f"• Total records: {total}\n"
-    # Try to summarize categorical columns using metadata
-    for col_name, col_desc in columns.items():
-        # Validate column name
-        if not validate_identifier(col_name):
-            continue
-        # Try to determine if it's a categorical column based on name/description
-        # Skip likely numeric/date columns
-        col_lower = col_name.lower()
-        if any(skip in col_lower for skip in ["id", "_id", "date", "time", "count", "amount", "price"]):
-            continue
-        # Try to get breakdown for text-like columns
-        try:
-            # Note: SQLite doesn't support parameterized identifiers, so we validate
-            rows = cur.execute(
-                f"""
-                SELECT {col_name}, COUNT(*)
-                FROM {table_name}
-                GROUP BY {col_name}
-                ORDER BY COUNT(*) DESC
-                LIMIT 5
-                """
-            ).fetchall()
-            if rows:
-                summary += f"\n• {col_name.capitalize()} breakdown:\n"
-                for val, count in rows:
-                    summary += f"  - {val}: {count}\n"
-        except (sqlite3.Error, sqlite3.OperationalError) as e:
-            # Ignore columns that can't be grouped (likely not categorical)
-            pass
-    summary += "\nYou can ask more detailed questions about this data."
-    return summary
 # =========================
 # MAIN ENGINE
 # =========================
 def process_question(question):
     question = correct_spelling(question)
-    question = normalize_time_question(question)
     # 1️⃣ Metadata requests
     if any(x in question.lower() for x in ["what data", "what tables"]):
         return {
             "status": "ok",
-            "message": describe_schema(),
-            "data": []
         }
     # 2️⃣ Build LLM prompt
@@ -638,8 +477,7 @@ def process_question(question):
     except Exception as e:
         return {
             "status": "error",
-            "message": str(e),
-            "data": []
         }
     # 3️⃣ Generate SQL
@@ -648,15 +486,13 @@ def process_question(question):
     except Exception as e:
         return {
             "status": "error",
-            "message": str(e),
-            "data": []
         }
     if sql == "NOT_ANSWERABLE":
         return {
             "status": "ok",
-            "message": "I don't have enough data to answer that.",
-            "data": []
         }
     # 4️⃣ Sanitize & validate
@@ -665,36 +501,18 @@ def process_question(question):
         sql = correct_table_names(sql)
         sql = validate_sql(sql)
         sql_info = explain_sql(sql)
-    except Exception as e:
-        return {
-            "status": "error",
-            "message": str(e),
-            "data": []
-        }
-    # 5️⃣ Execute
-    try:
-        cols, rows = run_query(sql)
     except Exception as e:
         return {
             "status": "error",
-            "message": str(e),
-            "data": []
         }
-    # 6️⃣ Log
-    log_interaction(
-        user_q=question,
-        sql=sql,
-        result=rows[:10]
-    )
-    # 7️⃣ Return
     return {
-        "status": "ok",
-        "sql": sql,
-        "sql_info": sql_info,
-        "columns": cols,
-        "data": rows
-    }

 import os
 import re
 from openai import OpenAI
 from difflib import get_close_matches
 from datetime import datetime
 if not api_key:
     raise ValueError("OPENAI_API_KEY environment variable is not set")
 client = OpenAI(api_key=api_key)
 # =========================
     return f"Sure \n\n{text}"
 def friendly(text):
         return text
 def is_confirmation(text):
 # TIME HANDLING
 # =========================
 # =========================
 # SQL GENERATION
 # =========================
         "has_filter": "where" in sql.lower()
     }
 # =========================
 # PATIENT SUMMARY
 # =========================
     # Must start with letter or underscore, rest alphanumeric/underscore
     return bool(re.match(r'^[a-zA-Z_][a-zA-Z0-9_]*$', name))
 # =========================
 # MAIN ENGINE
 # =========================
 def process_question(question):
     question = correct_spelling(question)
     # 1️⃣ Metadata requests
     if any(x in question.lower() for x in ["what data", "what tables"]):
         return {
             "status": "ok",
+            "message": describe_schema()
         }
     # 2️⃣ Build LLM prompt
     except Exception as e:
         return {
             "status": "error",
+            "message": str(e)
         }
     # 3️⃣ Generate SQL
     except Exception as e:
         return {
             "status": "error",
+            "message": str(e)
         }
     if sql == "NOT_ANSWERABLE":
         return {
             "status": "ok",
+            "message": "I don't have enough data to answer that."
         }
     # 4️⃣ Sanitize & validate
         sql = correct_table_names(sql)
         sql = validate_sql(sql)
         sql_info = explain_sql(sql)
     except Exception as e:
         return {
             "status": "error",
+            "message": str(e)
         }
+    # 5️⃣ Return SQL only (no execution)
     return {
+    "status": "ok",
+    "message": humanize(
+        "Here’s the SQL query I generated based on your question 😊"
+    ),
+    "sql": sql,
+    "sql_info": sql_info
+}