Spaces:

bhavika24
/

Text_to_sql

Sleeping

App Files Files Community

bhavika24 committed on Jan 20

Commit

25a0c35

verified ·

1 Parent(s): 52a5931

Upload engine.py

Browse files

Files changed (1) hide show

engine.py +87 -176

engine.py CHANGED Viewed

@@ -2,6 +2,7 @@ import os
 import sqlite3
 from openai import OpenAI
 from difflib import get_close_matches
 # =========================
@@ -13,40 +14,59 @@ conn = sqlite3.connect("hospital.db", check_same_thread=False)
 # =========================
-# Known Terms for Spell Correction
 # =========================
 KNOWN_TERMS = [
-    "patient", "patients", "condition", "conditions", "diagnosis", "encounter", "encounters",
-    "visit", "visits", "observation", "observations", "lab", "labs", "test", "tests",
-    "medication", "medications", "drug", "drugs", "prescription", "prescriptions",
-    "diabetes", "hypertension", "asthma", "cancer", "admitted", "admission"
 ]
 def correct_spelling(question: str) -> str:
     words = question.split()
-    corrected_words = []
     for word in words:
-        clean_word = word.lower().strip(",.?")
-        matches = get_close_matches(clean_word, KNOWN_TERMS, n=1, cutoff=0.8)
-        if matches:
-            corrected_words.append(matches[0])
-        else:
-            corrected_words.append(word)
-    return " ".join(corrected_words)
 # =========================
-# Metadata Loader
 # =========================
 def load_ai_schema():
     cur = conn.cursor()
     schema = {}
     tables = cur.execute("""
@@ -55,17 +75,14 @@ def load_ai_schema():
         WHERE ai_enabled = 1
     """).fetchall()
-    for table_name, desc in tables:
         cols = cur.execute("""
             SELECT column_name, description
             FROM ai_columns
             WHERE table_name = ? AND ai_allowed = 1
-        """, (table_name,)).fetchall()
-        schema[table_name] = {
-            "description": desc,
-            "columns": cols
-        }
     return schema
@@ -81,23 +98,15 @@ def build_prompt(question: str) -> str:
 You are a hospital data assistant.
 Rules:
-- Generate only SELECT SQL queries.
-- Use only the tables and columns provided.
-- Do not invent tables or columns.
-- This database is SQLite. Use SQLite-compatible date functions.
-- For recent days use: date('now', '-N day')
-- Use case-insensitive matching for text fields.
-- Prefer LIKE with wildcards for medical condition names.
-- Use COUNT, AVG, MIN, MAX, GROUP BY when the question asks for totals, averages, or comparisons.
-- If the question cannot be answered using the schema, return NOT_ANSWERABLE.
-- Do not explain the query.
-- Return only SQL or NOT_ANSWERABLE.
-Available schema:
 """
     for table, meta in schema.items():
-        prompt += f"\nTable: {table} - {meta['description']}\n"
         for col, desc in meta["columns"]:
             prompt += f"  - {col}: {desc}\n"
@@ -106,212 +115,114 @@ Available schema:
 # =========================
-# LLM Call
 # =========================
 def call_llm(prompt: str) -> str:
-    response = client.chat.completions.create(
         model="gpt-4.1-mini",
         messages=[
-            {"role": "system", "content": "You are a SQL generator. Return only SQL. No explanation."},
             {"role": "user", "content": prompt}
         ],
-        temperature=0.0
     )
-    return response.choices[0].message.content.strip()
 # =========================
-# SQL Generation
 # =========================
-def generate_sql(question: str) -> str:
-    prompt = build_prompt(question)
-    sql = call_llm(prompt)
-    return sql.strip()
-# =========================
-# SQL Cleaning & Validation
-# =========================
-def clean_sql(sql: str) -> str:
-    sql = sql.strip()
-    # Remove markdown code fences if present
-    if sql.startswith("```"):
-        parts = sql.split("```")
-        if len(parts) > 1:
-            sql = parts[1]
-    sql = sql.replace("sql\n", "").strip()
-    return sql
-def validate_sql(sql: str) -> str:
     sql = clean_sql(sql)
-    s = sql.lower()
-    forbidden = ["insert", "update", "delete", "drop", "alter", "truncate"]
-    if not s.startswith("select"):
-        raise Exception("Only SELECT queries allowed")
-    if any(f in s for f in forbidden):
-        raise Exception("Forbidden SQL operation detected")
     return sql
-# =========================
-# Query Runner
-# =========================
-def run_query(sql: str):
     cur = conn.cursor()
-    result = cur.execute(sql).fetchall()
-    columns = [desc[0] for desc in cur.description]
-    return columns, result
 # =========================
-# Guardrails
-# =========================
-def is_question_answerable(question):
-    keywords = [
-        "patient", "encounter", "condition", "observation",
-        "medication", "visit", "diagnosis", "lab", "vital", "admitted"
-    ]
-    q = question.lower()
-    if not any(k in q for k in keywords):
-        return False
-    return True
-# =========================
-# Time Awareness
 # =========================
 def get_latest_data_date():
-    sql = "SELECT MAX(start_date) FROM encounters;"
-    _, rows = run_query(sql)
     return rows[0][0]
-def check_time_relevance(question: str):
-    q = question.lower()
-    if any(word in q for word in ["last", "recent", "today", "this month", "this year"]):
-        latest = get_latest_data_date()
-        return f"Latest available data is from {latest}."
-    return None
-# =========================
-# Empty Result Interpreter
-# =========================
-def interpret_empty_result(question: str):
-    latest = get_latest_data_date()
-    return f"No results found. Available data is up to {latest}."
-# =========================
-# Data Range Check
-# =========================
-from datetime import datetime
-def is_request_out_of_data_range(question: str) -> bool:
     latest = get_latest_data_date()
     if not latest:
         return True
-    latest_date = datetime.fromisoformat(latest.replace("Z", "").split("T")[0])
     now = datetime.now()
     q = question.lower()
     if "this year" in q:
-        return latest_date.year < now.year
     if "last month" in q:
-        return (now.year, now.month - 1) > (latest_date.year, latest_date.month)
     if "recent" in q or "last 30" in q:
-        return (now - latest_date).days > 30
     return False
 # =========================
-# ORCHESTRATOR (Single Entry Point)
 # =========================
 def process_question(question: str):
-    # 0. Spell correction
     question = correct_spelling(question)
-    # 1. Guardrail
-    if not is_question_answerable(question):
-        return {
-            "status": "rejected",
-            "message": "This question is not supported by the available data."
-        }
-    # 2. Time relevance
-    # 2. Time relevance check
-    if is_request_out_of_data_range(question):
         latest = get_latest_data_date()
         return {
             "status": "ok",
-            "message": f"No data available for the requested time period. Latest available data is from {latest}.",
-            "data": [],
-            "sql": None,
-            "note": None
-            }
-    # 3. Generate SQL
     sql = generate_sql(question)
-    # 4. Validate SQL
     sql = validate_sql(sql)
-    # 5. Execute query
-    columns, rows = run_query(sql)
-    # 6. Handle empty result with data coverage awareness
-    if len(rows) == 0:
-        latest = get_latest_data_date()
-        q = question.lower()
-        if any(word in q for word in ["last", "recent", "this month", "this year"]):
-            return {
-                "status": "ok",
-                "sql": sql,
-                "message": f"No data available for the requested time period. Latest available data is from {latest}.",
-                "data": [],
-                "note": None
-            }
         return {
             "status": "ok",
-            "sql": sql,
-            "message": interpret_empty_result(question),
-            "data": [],
-            "note": time_note
         }
-    # 7. Normal response
     return {
         "status": "ok",
         "sql": sql,
-        "columns": columns,
-        "data": rows[:50],  # demo safety limit
-        "note": time_note
     }

 import sqlite3
 from openai import OpenAI
 from difflib import get_close_matches
+from datetime import datetime
 # =========================
 # =========================
+# Known Terms
 # =========================
 KNOWN_TERMS = [
     "patient", "patients", "condition", "conditions", "diagnosis",
     "encounter", "encounters", "visit", "visits",
     "observation", "observations", "lab", "labs",
     "medication", "medications",
     "diabetes", "hypertension", "asthma",
     "admitted", "admission"
 ]


 def correct_spelling(question: str) -> str:
     """Replace misspelled domain words in *question* with their known spelling.

     The question is split on whitespace. Each word is stripped of leading
     and trailing ``,``, ``.`` and ``?``, lower-cased, and compared against
     ``KNOWN_TERMS`` with ``difflib.get_close_matches`` (cutoff 0.8). A word
     with a close match is replaced by the matched term; any other word is
     kept exactly as typed.

     Punctuation surrounding a corrected word is put back around the
     replacement, so "diabtes," becomes "diabetes," rather than losing the
     comma.

     Returns the words re-joined with single spaces.
     """
     punctuation = ",.?"
     fixed = []
     for word in question.split():
         core = word.strip(punctuation)
         match = get_close_matches(core.lower(), KNOWN_TERMS, n=1, cutoff=0.8)
         if not match:
             fixed.append(word)
             continue
         # Re-attach whatever punctuation was stripped before matching.
         lead = word[:len(word) - len(word.lstrip(punctuation))]
         trail = word[len(word.rstrip(punctuation)):]
         fixed.append(f"{lead}{match[0]}{trail}")
     return " ".join(fixed)
+# =========================
+# Unsupported Concept Check
+# =========================
def get_unsupported_reason(question: str):
    """Return why *question* cannot be answered, or ``None`` if it may be.

    The question is lower-cased and checked against groups of keywords, in
    order; the first group that matches decides the returned message.

    Long keywords ("doctor", "department", ...) are matched as substrings so
    plurals and possessives are still caught. The abbreviation "hr" is
    matched only as a whole word: as a substring it also occurs inside
    ordinary words such as "chronic", "three" and "through", which would
    wrongly reject valid clinical questions.
    """
    q = question.lower()
    # Whole words of the question, with punctuation treated as a separator.
    words = set("".join(ch if ch.isalnum() else " " for ch in q).split())

    # (substring keywords, whole-word keywords, message)
    rules = (
        (("consultant", "doctor", "doctors"), (),
         "Consultant or doctor workload data is not available."),
        (("specialization", "department"), (),
         "Doctor specialization or department data is not available."),
        (("insurance", "policy"), (),
         "Insurance-related data is not available."),
        (("staff", "employee"), ("hr",),
         "HR or staff data is not available."),
    )
    for substrings, whole_words, reason in rules:
        if any(s in q for s in substrings) or words.intersection(whole_words):
            return reason
    return None
 # =========================
+# Metadata
 # =========================
 def load_ai_schema():
     cur = conn.cursor()
     schema = {}
     tables = cur.execute("""
         WHERE ai_enabled = 1
     """).fetchall()
+    for table, desc in tables:
         cols = cur.execute("""
             SELECT column_name, description
             FROM ai_columns
             WHERE table_name = ? AND ai_allowed = 1
+        """, (table,)).fetchall()
+        schema[table] = {"description": desc, "columns": cols}
     return schema
 You are a hospital data assistant.
 Rules:
+- Only generate SELECT queries.
+- Use only provided tables and columns.
+- SQLite syntax only.
+- Use date('now', '-N day') for time filters.
+- Return ONLY SQL or NOT_ANSWERABLE.
 """
     for table, meta in schema.items():
+        prompt += f"\nTable: {table}\n"
         for col, desc in meta["columns"]:
             prompt += f"  - {col}: {desc}\n"
 # =========================
+# LLM
 # =========================
 def call_llm(prompt: str) -> str:
     """Send *prompt* to the chat model and return its reply, stripped.

     Uses the module-level ``client`` with model ``gpt-4.1-mini`` at
     temperature 0 and a fixed system message asking for SQL only.
     """
     conversation = [
         {"role": "system", "content": "Return only SQL."},
         {"role": "user", "content": prompt},
     ]
     completion = client.chat.completions.create(
         model="gpt-4.1-mini",
         messages=conversation,
         temperature=0,
     )
     reply = completion.choices[0].message.content
     return reply.strip()
 # =========================
+# SQL Helpers
 # =========================
def clean_sql(sql):
    """Strip Markdown code fences and a leading ``sql`` tag from model output.

    If the text contains a fenced block (three backticks), only the content
    between the first opening fence and the next fence is kept. A leading
    language tag ``sql`` (any case) followed by whitespace is then removed.
    Text without a fence is returned unchanged apart from whitespace
    trimming.

    Only the leading tag is removed, never an occurrence inside the query,
    so a statement such as ``... FROM mysql\\nWHERE ...`` is left intact.
    """
    text = sql.strip()

    opening = text.find("```")
    if opening != -1:
        text = text[opening + 3:]
        closing = text.find("```")
        if closing != -1:
            text = text[:closing]

    text = text.lstrip()
    # A valid query never starts with the bare word "sql", so this can only
    # be the language tag of a code fence.
    if text[:3].lower() == "sql" and (len(text) == 3 or text[3].isspace()):
        text = text[3:]
    return text.strip()
def validate_sql(sql):
    """Clean *sql* with ``clean_sql`` and accept it only if it is a SELECT.

    Returns the cleaned statement when it starts with ``select`` (any case);
    otherwise raises ``Exception("Invalid SQL")``.
    """
    cleaned = clean_sql(sql)
    if cleaned.lower().startswith("select"):
        return cleaned
    raise Exception("Invalid SQL")
def run_query(sql):
    """Execute *sql* on the module-level connection.

    Returns a ``(column_names, rows)`` pair: the names come from the first
    field of each entry in the cursor description, the rows from
    ``fetchall()``.
    """
    cursor = conn.cursor()
    cursor.execute(sql)
    fetched = cursor.fetchall()
    headers = [column[0] for column in cursor.description]
    return headers, fetched
 # =========================
+# Time Logic
 # =========================
 def get_latest_data_date():
     """Return ``MAX(start_date)`` from the ``encounters`` table.

     The value is passed through exactly as the database returns it.
     """
     _columns, result_rows = run_query("SELECT MAX(start_date) FROM encounters;")
     first_row = result_rows[0]
     return first_row[0]
def is_out_of_range(question: str, *, latest=None, now=None):
    """Tell whether *question* asks about a period newer than the stored data.

    Args:
        question: The user's question; matched case-insensitively against
            the phrases "this year", "last month", "recent" and "last 30".
        latest: Optional latest data date as an ISO-style string. When
            omitted it is looked up with ``get_latest_data_date()``.
        now: Optional reference ``datetime``; defaults to ``datetime.now()``.

    Returns:
        ``True`` when no latest date is available, or when the phrase found
        in the question refers to a period after the latest date; ``False``
        otherwise, including when no time phrase is present.
    """
    if latest is None:
        latest = get_latest_data_date()
    if not latest:
        return True

    # Keep only the YYYY-MM-DD part: this drops any time, "Z" suffix or UTC
    # offset, so the result is always a naive datetime that can be
    # subtracted from ``now``.
    latest_dt = datetime.fromisoformat(str(latest)[:10])
    if now is None:
        now = datetime.now()
    q = question.lower()

    if "this year" in q:
        return latest_dt.year < now.year

    if "last month" in q:
        # In January the previous month is December of the previous year,
        # not month 0 of the current year.
        if now.month == 1:
            previous = (now.year - 1, 12)
        else:
            previous = (now.year, now.month - 1)
        return previous > (latest_dt.year, latest_dt.month)

    if "recent" in q or "last 30" in q:
        return (now - latest_dt).days > 30

    return False
 # =========================
+# MAIN ENTRY
 # =========================
 def process_question(question: str):
     """Answer a natural-language question and return a response dict.

     Steps, in order:
       1. Spell-correct the question.
       2. Reject it (``status == "rejected"``) if it touches a concept the
          data does not cover.
       3. If it asks about a period newer than the stored data, return an
          empty ``data`` list with an explanatory ``message``/``note``.
       4. Otherwise generate SQL, validate it, run it, and return either a
          "no matching records" message or the first 50 rows together with
          the column names and the SQL that produced them.

     Exceptions raised by SQL validation or execution are not caught here.
     """
     question = correct_spelling(question)

     # Unsupported concept
     reason = get_unsupported_reason(question)
     if reason:
         return {"status": "rejected", "message": reason}

     # Out-of-range data
     if is_out_of_range(question):
         latest = get_latest_data_date()
         if latest:
             note = f"Latest available data is from {latest}."
             suggestion = f"Try asking about data from {str(latest)[:4]}."
         else:
             # is_out_of_range() also returns True when there is no latest
             # date at all; slicing None would raise a TypeError here.
             note = "No dated records are available."
             suggestion = None
         return {
             "status": "ok",
             "message": "No data available for the requested time period.",
             "note": note,
             "suggestion": suggestion,
             "data": []
         }

     # Generate SQL
     # NOTE(review): generate_sql is not defined in the visible part of this
     # file - confirm it still exists before relying on this call.
     sql = generate_sql(question)
     sql = validate_sql(sql)

     # Execute
     cols, rows = run_query(sql)
     if not rows:
         return {
             "status": "ok",
             "message": "No matching records found.",
             "data": []
         }

     return {
         "status": "ok",
         "sql": sql,
         "columns": cols,
         "data": rows[:50]
     }