Spaces:

bhavika24
/

Text_to_sql

Sleeping

App Files Files Community

bhavika24 committed 10 days ago

Commit

bfa0b78

verified ·

1 Parent(s): 1932ae9

Upload engine.py

Browse files

Files changed (1) hide show

engine.py +465 -429

engine.py CHANGED Viewed

@@ -1,429 +1,465 @@
-import os
-import sqlite3
-from openai import OpenAI
-from difflib import get_close_matches
-from datetime import datetime
-# =========================
-# SETUP
-# =========================
-client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
-conn = sqlite3.connect("hospital.db", check_same_thread=False)
-# =========================
-# CONVERSATION STATE
-# =========================
-LAST_PROMPT_TYPE = None
-LAST_SUGGESTED_DATE = None
-# =========================
-# HUMAN RESPONSE HELPERS
-# =========================
-def humanize(text):
-    return f"Sure 🙂\n\n{text}"
-def friendly(text):
-    return f"{text}\n\nIf you want, I can help you explore this further 🙂"
-def is_confirmation(text):
-    return text.strip().lower() in ["yes", "yep", "yeah", "ok", "okay", "sure"]
-def is_why_question(text):
-    return text.strip().lower().startswith("why")
-# =========================
-# SPELL CORRECTION
-# =========================
-KNOWN_TERMS = [
-    "patient", "patients", "condition", "conditions",
-    "encounter", "encounters", "visit", "visits",
-    "medication", "medications",
-    "admitted", "admission",
-    "year", "month", "last", "recent", "today"
-]
-def correct_spelling(q):
-    words = q.split()
-    fixed = []
-    for w in words:
-        clean = w.lower().strip(",.?")
-        match = get_close_matches(clean, KNOWN_TERMS, n=1, cutoff=0.8)
-        fixed.append(match[0] if match else w)
-    return " ".join(fixed)
-# =========================
-# SCHEMA
-# =========================
-def load_ai_schema():
-    cur = conn.cursor()
-    schema = {}
-    tables = cur.execute("""
-        SELECT table_name, description
-        FROM ai_tables
-        WHERE ai_enabled = 1
-    """).fetchall()
-    for table, desc in tables:
-        cols = cur.execute("""
-            SELECT column_name, description
-            FROM ai_columns
-            WHERE table_name = ? AND ai_allowed = 1
-        """, (table,)).fetchall()
-        schema[table] = {
-            "description": desc,
-            "columns": cols
-        }
-    return schema
-# =========================
-# HUMAN SCHEMA DESCRIPTION
-# =========================
-def describe_schema():
-    schema = load_ai_schema()
-    response = "Here’s the data I currently have access to:\n\n"
-    for table, meta in schema.items():
-        response += f"• **{table.capitalize()}** — {meta['description']}\n"
-        for col, desc in meta["columns"]:
-            response += f"  - {col}: {desc}\n"
-        response += "\n"
-    response += (
-        "You can ask things like:\n"
-        "• How many patients are there?\n"
-        "• Patient count by gender\n"
-        "• Admissions by year\n\n"
-        "Just tell me what you want to explore 🙂"
-    )
-    return response
-# =========================
-# TIME HANDLING
-# =========================
-def get_latest_data_date():
-    cur = conn.cursor()
-    r = cur.execute("SELECT MAX(start_date) FROM encounters").fetchone()
-    return r[0]
-def normalize_time_question(q):
-    latest = get_latest_data_date()
-    if not latest:
-        return q
-    if "today" in q:
-        return q.replace("today", f"on {latest[:10]}")
-    if "yesterday" in q:
-        return q.replace("yesterday", f"on {latest[:10]}")
-    return q
-# =========================
-# UNSUPPORTED QUESTIONS
-# =========================
-def get_unsupported_reason(q):
-    q = q.lower()
-    if any(w in q for w in ["consultant", "doctor"]):
-        return {
-            "reason": "Doctor or consultant-level data is not available.",
-            "suggestion": "Try asking about patients, visits, or admissions."
-        }
-    if any(w in q for w in ["department", "specialization"]):
-        return {
-            "reason": "Department-level data is not stored.",
-            "suggestion": "Try patient or visit related questions."
-        }
-    return None
-# =========================
-# SQL GENERATION
-# =========================
-def build_prompt(question):
-    schema = load_ai_schema()
-    prompt = """
-You are a hospital SQL assistant.
-Rules:
-- Use only SELECT
-- SQLite syntax
-- Use only listed tables/columns
-- Return ONLY SQL or NOT_ANSWERABLE
-"""
-    for table, meta in schema.items():
-        prompt += f"\nTable: {table}\n"
-        for col, desc in meta["columns"]:
-            prompt += f"- {col}: {desc}\n"
-    prompt += f"\nQuestion: {question}\n"
-    return prompt
-def call_llm(prompt):
-    res = client.chat.completions.create(
-        model="gpt-4.1-mini",
-        messages=[
-            {"role": "system", "content": "Return only SQL or NOT_ANSWERABLE"},
-            {"role": "user", "content": prompt}
-        ],
-        temperature=0
-    )
-    return res.choices[0].message.content.strip()
-# =========================
-# SQL SAFETY
-# =========================
-def sanitize_sql(sql):
-    sql = sql.replace("```", "").replace("sql", "").strip()
-    sql = sql.split(";")[0]
-    return sql.replace("\n", " ").strip()
-def validate_sql(sql):
-    if not sql.lower().startswith("select"):
-        raise Exception("Only SELECT allowed")
-    return sql
-def run_query(sql):
-    cur = conn.cursor()
-    rows = cur.execute(sql).fetchall()
-    cols = [c[0] for c in cur.description]
-    return cols, rows
-# =========================
-# AGGREGATE SAFETY
-# =========================
-def is_aggregate_only_query(sql):
-    s = sql.lower()
-    return ("count(" in s or "sum(" in s or "avg(" in s) and "group by" not in s
-def has_underlying_data(sql):
-    base = sql.lower()
-    if "from" not in base:
-        return False
-    base = base.split("from", 1)[1]
-    test_sql = "SELECT 1 FROM " + base.split("group by")[0] + " LIMIT 1"
-    cur = conn.cursor()
-    return cur.execute(test_sql).fetchone() is not None
-def is_patient_summary_question(text):
-    t = text.lower()
-    keywords = [
-        "patient summary",
-        "patients summary",
-        "patient overview",
-        "summary of patients",
-        "give a patient summary",
-        "patient summery",
-        "patients summery",
-        "patent summary",
-        "patant summary",
-        "patient sumary",
-        "patients sumery"
-    ]
-    return any(k in t for k in keywords)
-# =========================
-# PATIENT SUMMARY
-# =========================
-def build_patient_summary():
-    cur = conn.cursor()
-    total = cur.execute(
-        "SELECT COUNT(*) FROM patients"
-    ).fetchone()[0]
-    genders = cur.execute(
-        "SELECT gender, COUNT(*) FROM patients GROUP BY gender"
-    ).fetchall()
-    msg = "Here’s a quick summary of patients:\n\n"
-    msg += f"• Total patients: {total}\n"
-    if genders:
-        msg += "• Gender distribution:\n"
-        for g, c in genders:
-            msg += f"  - {g}: {c}\n"
-    msg += (
-        "\nYou can also ask:\n"
-        "• Patients admitted by year\n"
-        "• Patient count by age\n"
-        "• Visit trends"
-    )
-    return msg
-# =========================
-# MAIN ENGINE
-# =========================
-def process_question(question):
-    global LAST_PROMPT_TYPE, LAST_SUGGESTED_DATE
-    q = question.strip().lower()
-    # -------------------------------
-    # Patient summary intent
-    # -------------------------------
-    if is_patient_summary_question(q):
-        return {
-            "status": "ok",
-            "message": build_patient_summary(),
-            "data": [],
-            "sql": None,
-            "note": None
-        }
-    # -------------------------------
-    # WHY follow-up handling
-    # -------------------------------
-    if is_why_question(q) and LAST_PROMPT_TYPE == "NO_DATA":
-        year = LAST_SUGGESTED_DATE[:4] if LAST_SUGGESTED_DATE else "the latest available year"
-        return {
-            "status": "ok",
-            "message": (
-                f"I suggested **{year}** because that’s the most recent year "
-                f"for which data exists in the system.\n\n"
-                "Your database doesn’t contain newer records yet.\n\n"
-                "You can explore:\n"
-                "• Data from 2021\n"
-                "• Trends over time\n"
-                "• Patient summaries"
-            ),
-            "data": []
-        }
-    # -------------------------------
-    # YES / confirmation handling
-    # -------------------------------
-    if is_confirmation(q) and LAST_PROMPT_TYPE == "NO_DATA":
-        return {
-            "status": "ok",
-            "message": (
-                "Great 🙂\n\n"
-                "Here are some things you can ask:\n"
-                "• How many patients were admitted in 2021?\n"
-                "• Patient count by gender\n"
-                "• Total visits by month\n"
-                "• Most common conditions"
-            ),
-            "data": []
-        }
-    # -------------------------------
-    # Normalize question
-    # -------------------------------
-    question = correct_spelling(question)
-    question = normalize_time_question(question)
-    # Reset state once user asks a fresh question
-    LAST_PROMPT_TYPE = None
-    LAST_SUGGESTED_DATE = None
-    # -------------------------------
-    # Metadata queries
-    # -------------------------------
-    if any(x in question for x in ["what data", "what tables", "which data"]):
-        return {
-            "status": "ok",
-            "message": humanize(describe_schema()),
-            "data": []
-        }
-    # -------------------------------
-    # Unsupported questions
-    # -------------------------------
-    unsupported = get_unsupported_reason(question)
-    if unsupported:
-        return {
-            "status": "ok",
-            "message": (
-                f"{unsupported['reason']}\n\n"
-                f"{unsupported['suggestion']}\n\n"
-                "Example questions:\n"
-                "• How many patients were admitted last year?\n"
-                "• Total visits by month\n"
-                "• Patient count by gender"
-            ),
-            "data": []
-        }
-    # -------------------------------
-    # LLM → SQL
-    # -------------------------------
-    sql = call_llm(build_prompt(question))
-    if sql == "NOT_ANSWERABLE":
-        return {
-            "status": "ok",
-            "message": "I don’t have enough data to answer that.",
-            "data": []
-        }
-    sql = validate_sql(sanitize_sql(sql))
-    cols, rows = run_query(sql)
-    # -------------------------------
-    # No data (aggregate case)
-    # -------------------------------
-    if is_aggregate_only_query(sql) and not has_underlying_data(sql):
-        LAST_PROMPT_TYPE = "NO_DATA"
-        LAST_SUGGESTED_DATE = get_latest_data_date()
-        return {
-            "status": "ok",
-            "message": friendly("No data is available for that time period."),
-            "note": f"Available data is only up to {LAST_SUGGESTED_DATE}.",
-            "data": [],
-            "sql": None
-        }
-    # -------------------------------
-    # Empty result set
-    # -------------------------------
-    if not rows:
-        LAST_PROMPT_TYPE = "NO_DATA"
-        LAST_SUGGESTED_DATE = get_latest_data_date()
-        return {
-            "status": "ok",
-            "message": friendly("No records found."),
-            "note": f"Available data is only up to {LAST_SUGGESTED_DATE}.",
-            "data": []
-        }
-    # -------------------------------
-    # Successful response
-    # -------------------------------
-    return {
-        "status": "ok",
-        "sql": sql,
-        "columns": cols,
-        "data": rows
-    }

+import os
+import sqlite3
+from openai import OpenAI
+from difflib import get_close_matches
+from datetime import datetime
+# =========================
+# SETUP
+# =========================
+client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+conn = sqlite3.connect("hospital.db", check_same_thread=False)
+# =========================
+# CONVERSATION STATE
+# =========================
+LAST_PROMPT_TYPE = None
+LAST_SUGGESTED_DATE = None
+# =========================
+# HUMAN RESPONSE HELPERS
+# =========================
+def humanize(text):
+    return f"Sure \n\n{text}"
+def friendly(text):
+    return f"{text}\n\nIf you want, I can help you explore this further "
+def is_confirmation(text):
+    return text.strip().lower() in ["yes", "yep", "yeah", "ok", "okay", "sure"]
+def is_why_question(text):
+    return text.strip().lower().startswith("why")
+# =========================
+# SPELL CORRECTION
+# =========================
+KNOWN_TERMS = [
+    "patient", "patients", "condition", "conditions",
+    "encounter", "encounters", "visit", "visits",
+    "medication", "medications",
+    "admitted", "admission",
+    "year", "month", "last", "recent", "today"
+]
+def correct_spelling(q):
+    words = q.split()
+    fixed = []
+    for w in words:
+        clean = w.lower().strip(",.?")
+        match = get_close_matches(clean, KNOWN_TERMS, n=1, cutoff=0.8)
+        fixed.append(match[0] if match else w)
+    return " ".join(fixed)
+# =========================
+# SCHEMA
+# =========================
+from functools import lru_cache
+@lru_cache(maxsize=1)
+def load_ai_schema():
+    cur = conn.cursor()
+    schema = {}
+    tables = cur.execute("""
+        SELECT table_name, description
+        FROM ai_tables
+        WHERE ai_enabled = 1
+    """).fetchall()
+    for table, desc in tables:
+        cols = cur.execute("""
+            SELECT column_name, description
+            FROM ai_columns
+            WHERE table_name = ? AND ai_allowed = 1
+        """, (table,)).fetchall()
+        schema[table] = {
+            "description": desc,
+            "columns": cols
+        }
+    return schema
+# =========================
+# TABLE MATCHING (CORE LOGIC)
+# =========================
+def extract_relevant_tables(question):
+    schema = load_ai_schema()
+    q = question.lower()
+    matched = []
+    for table, meta in schema.items():
+        # match table name
+        if table.lower() in q:
+            matched.append(table)
+            continue
+        # match column names
+        for col, _ in meta["columns"]:
+            if col.lower() in q:
+                matched.append(table)
+                break
+    return list(set(matched))[:5]
+# =========================
+# HUMAN SCHEMA DESCRIPTION
+# =========================
+def describe_schema():
+    schema = load_ai_schema()
+    response = "Here’s the data I currently have access to:\n\n"
+    for table, meta in schema.items():
+        response += f"• **{table.capitalize()}** — {meta['description']}\n"
+        for col, desc in meta["columns"]:
+            response += f"  - {col}: {desc}\n"
+        response += "\n"
+    response += (
+        "You can ask things like:\n"
+        "• How many patients are there?\n"
+        "• Patient count by gender\n"
+        "• Admissions by year\n\n"
+        "Just tell me what you want to explore "
+    )
+    if not schema:
+        return "No AI-enabled tables are configured."
+    return response
+# =========================
+# TIME HANDLING
+# =========================
+def get_latest_data_date():
+    cur = conn.cursor()
+    r = cur.execute("SELECT MAX(start_date) FROM encounters").fetchone()
+    return r[0]
+def normalize_time_question(q):
+    latest = get_latest_data_date()
+    if not latest:
+        return q
+    if "today" in q:
+        return q.replace("today", f"on {latest[:10]}")
+    if "yesterday" in q:
+        return q.replace("yesterday", f"on {latest[:10]}")
+    return q
+# =========================
+# UNSUPPORTED QUESTIONS
+# =========================
+def is_question_supported(question):
+    q = question.lower()
+    if any(k in q for k in [
+        "count", "total", "average", "sum",
+        "how many", "number of", "trend"
+    ]):
+        return True
+    schema = load_ai_schema()
+    for table, meta in schema.items():
+        if table in q:
+            return True
+        for col, _ in meta["columns"]:
+            if col in q:
+                return True
+    return False
+# =========================
+# SQL GENERATION
+# =========================
+def build_prompt(question):
+    matched = extract_relevant_tables(question)
+    if matched:
+        schema = {t: load_ai_schema()[t] for t in matched}
+    else:
+        schema = load_ai_schema()  # fallback if nothing matched
+    prompt = """
+You are a hospital SQL assistant.
+Rules:
+- Use only SELECT
+- SQLite syntax
+- Use only listed tables/columns
+- Return ONLY SQL or NOT_ANSWERABLE
+"""
+    for table, meta in schema.items():
+        prompt += f"\nTable: {table}\n"
+        for col, desc in meta["columns"]:
+            prompt += f"- {col}: {desc}\n"
+    prompt += f"\nQuestion: {question}\n"
+    return prompt
+def call_llm(prompt):
+    res = client.chat.completions.create(
+        model="gpt-4.1-mini",
+        messages=[
+            {"role": "system", "content": "Return only SQL or NOT_ANSWERABLE"},
+            {"role": "user", "content": prompt}
+        ],
+        temperature=0
+    )
+    return res.choices[0].message.content.strip()
+# =========================
+# SQL SAFETY
+# =========================
+def sanitize_sql(sql):
+    sql = sql.replace("```", "").replace("sql", "").strip()
+    sql = sql.split(";")[0]
+    return sql.replace("\n", " ").strip()
+def validate_sql(sql):
+    if not sql.lower().startswith("select"):
+        raise Exception("Only SELECT allowed")
+    return sql
+def run_query(sql):
+    cur = conn.cursor()
+    rows = cur.execute(sql).fetchall()
+    cols = [c[0] for c in cur.description]
+    return cols, rows
+# =========================
+# AGGREGATE SAFETY
+# =========================
+def is_aggregate_only_query(sql):
+    s = sql.lower()
+    return ("count(" in s or "sum(" in s or "avg(" in s) and "group by" not in s
+def has_underlying_data(sql):
+    base = sql.lower()
+    if "from" not in base:
+        return False
+    base = base.split("from", 1)[1]
+    test_sql = "SELECT 1 FROM " + base.split("group by")[0] + " LIMIT 1"
+    cur = conn.cursor()
+    return cur.execute(test_sql).fetchone() is not None
+# =========================
+# PATIENT SUMMARY
+# =========================
+def build_table_summary(table_name):
+    cur = conn.cursor()
+    # Total rows
+    total = cur.execute(
+        f"SELECT COUNT(*) FROM {table_name}"
+    ).fetchone()[0]
+    # Get column info
+    columns = cur.execute(
+        f"PRAGMA table_info({table_name})"
+    ).fetchall()
+    summary = f"Here’s a summary of **{table_name}**:\n\n"
+    summary += f"• Total records: {total}\n"
+    # Try to summarize categorical columns
+    for col in columns:
+        col_name = col[1]
+        col_type = col[2].lower()
+        if col_type in ("text", "varchar"):
+            try:
+                rows = cur.execute(
+                    f"""
+                    SELECT {col_name}, COUNT(*)
+                    FROM {table_name}
+                    GROUP BY {col_name}
+                    ORDER BY COUNT(*) DESC
+                    LIMIT 5
+                    """
+                ).fetchall()
+                if rows:
+                    summary += f"\n• {col_name.capitalize()} breakdown:\n"
+                    for val, count in rows:
+                        summary += f"  - {val}: {count}\n"
+            except:
+                pass  # ignore columns that can't be grouped
+    summary += "\nYou can ask more detailed questions about this data."
+    return summary
+# =========================
+# MAIN ENGINE
+# =========================
+def process_question(question):
+    global LAST_PROMPT_TYPE, LAST_SUGGESTED_DATE
+    q = question.strip().lower()
+    # ----------------------------------
+    # Normalize first
+    # ----------------------------------
+    question = correct_spelling(question)
+    question = normalize_time_question(question)
+    LAST_PROMPT_TYPE = None
+    LAST_SUGGESTED_DATE = None
+    # ----------------------------------
+    # Handle "data updated till"
+    # ----------------------------------
+    if any(x in q for x in ["updated", "upto", "up to", "latest data"]):
+        return {
+            "status": "ok",
+            "message": f"Data is available up to {get_latest_data_date()}",
+            "data": []
+        }
+    # ----------------------------------
+    # Extract relevant tables
+    # ----------------------------------
+    matched_tables = extract_relevant_tables(question)
+    # ----------------------------------
+    # SUMMARY ONLY IF USER ASKS FOR IT
+    # ----------------------------------
+    if (
+    len(matched_tables) == 1
+    and any(k in q for k in ["summary", "overview", "describe"])
+    and not any(k in q for k in ["count", "total", "how many", "average"])
+):
+        return {
+        "status": "ok",
+        "message": build_table_summary(matched_tables[0]),
+        "data": []
+    }
+    if len(matched_tables) > 1:
+        return {
+        "status": "ok",
+        "message": (
+            "Your question matches multiple datasets:\n"
+            + "\n".join(f"- {t}" for t in matched_tables)
+            + "\n\nPlease be more specific."
+        ),
+        "data": []
+    }
+    # ----------------------------------
+    # Metadata discovery
+    # ----------------------------------
+    if any(x in q for x in ["what data", "what tables", "which data"]):
+        return {
+            "status": "ok",
+            "message": humanize(describe_schema()),
+            "data": []
+        }
+    # ----------------------------------
+    # Unsupported question check
+    # ----------------------------------
+    if not is_question_supported(question):
+        return {
+            "status": "ok",
+            "message": (
+                "That information isn’t available in the system.\n\n"
+                "You can ask about:\n"
+                "• Patients\n"
+                "• Visits\n"
+                "• Conditions\n"
+                "• Medications"
+            ),
+            "data": []
+        }
+    # ----------------------------------
+    # Generate SQL
+    # ----------------------------------
+    sql = call_llm(build_prompt(question))
+    if sql == "NOT_ANSWERABLE":
+        return {
+            "status": "ok",
+            "message": "I don’t have enough data to answer that.",
+            "data": []
+        }
+    sql = validate_sql(sanitize_sql(sql))
+    cols, rows = run_query(sql)
+    # ----------------------------------
+    # No data handling
+    # ----------------------------------
+    if is_aggregate_only_query(sql) and not has_underlying_data(sql):
+        LAST_PROMPT_TYPE = "NO_DATA"
+        LAST_SUGGESTED_DATE = get_latest_data_date()
+        return {
+            "status": "ok",
+            "message": friendly("No data is available for that time period."),
+            "note": f"Available data is only up to {LAST_SUGGESTED_DATE}.",
+            "data": []
+        }
+    if not rows:
+        LAST_PROMPT_TYPE = "NO_DATA"
+        LAST_SUGGESTED_DATE = get_latest_data_date()
+        return {
+            "status": "ok",
+            "message": friendly("No records found."),
+            "note": f"Available data is only up to {LAST_SUGGESTED_DATE}.",
+            "data": []
+        }
+    # ----------------------------------
+    # Success
+    # ----------------------------------
+    return {
+        "status": "ok",
+        "sql": sql,
+        "columns": cols,
+        "data": rows
+    }