bhavika24 commited on
Commit
de9f3fd
·
verified ·
1 Parent(s): cb559f9

Upload 3 files

Browse files
Files changed (4) hide show
  1. .gitattributes +1 -0
  2. engine.py +136 -62
  3. metadata.json +49 -0
  4. mimic_iv.db +3 -0
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  hospital.db filter=lfs diff=lfs merge=lfs -text
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  hospital.db filter=lfs diff=lfs merge=lfs -text
37
+ mimic_iv.db filter=lfs diff=lfs merge=lfs -text
engine.py CHANGED
@@ -13,7 +13,7 @@ api_key = os.getenv("OPENAI_API_KEY")
13
  if not api_key:
14
  raise ValueError("OPENAI_API_KEY environment variable is not set")
15
  client = OpenAI(api_key=api_key)
16
- conn = sqlite3.connect("mimic_iv_demo.db", check_same_thread=False)
17
 
18
 
19
  # =========================
@@ -54,20 +54,22 @@ def is_why_question(text):
54
  # =========================
55
 
56
  KNOWN_TERMS = [
57
- "patient", "patients", "condition", "conditions",
58
- "encounter", "encounters", "visit", "visits",
59
- "medication", "medications",
60
- "admitted", "admission",
61
- "year", "month", "last", "recent", "today"
 
62
  ]
63
 
 
64
  def correct_spelling(q):
65
  words = q.split()
66
  fixed = []
67
  for w in words:
68
  clean = w.lower().strip(",.?")
69
  match = get_close_matches(clean, KNOWN_TERMS, n=1, cutoff=0.8)
70
- fixed.append(match[0] if match else w)
71
  return " ".join(fixed)
72
 
73
 
@@ -77,20 +79,26 @@ def correct_spelling(q):
77
  # =========================
78
  import json
79
  from functools import lru_cache
 
 
 
 
 
 
80
 
81
  @lru_cache(maxsize=1)
82
  def load_ai_schema():
83
  """Load schema from metadata JSON file with error handling."""
84
  try:
85
- with open("hospital_metadata.json", "r") as f:
86
  schema = json.load(f)
87
  if not isinstance(schema, dict):
88
  raise ValueError("Invalid metadata format: expected a dictionary")
89
  return schema
90
  except FileNotFoundError:
91
- raise FileNotFoundError("hospital_metadata.json file not found. Please create it with your table metadata.")
92
  except json.JSONDecodeError as e:
93
- raise ValueError(f"Invalid JSON in hospital_metadata.json: {str(e)}")
94
  except Exception as e:
95
  raise ValueError(f"Error loading metadata: {str(e)}")
96
 
@@ -114,17 +122,16 @@ def extract_relevant_tables(question, max_tables=4):
114
 
115
  # Build hints only for tables that actually exist
116
  hint_mappings = {
117
- "consultant": ["encounter", "encounters", "visit", "visits"],
118
- "doctor": ["encounter", "encounters", "provider", "providers"],
119
- "visit": ["encounter", "encounters", "visit", "visits"],
120
- "visited": ["encounter", "encounters", "visit", "visits"],
121
- "visits": ["encounter", "encounters", "visit", "visits"],
122
- "appointment": ["encounter", "encounters", "appointment", "appointments"],
123
- "patient": ["patient", "patients"],
124
- "medication": ["medication", "medications", "drug", "drugs"],
125
- "drug": ["medication", "medications", "drug", "drugs"],
126
- "condition": ["condition", "conditions", "diagnosis", "diagnoses"],
127
- "diagnosis": ["condition", "conditions", "diagnosis", "diagnoses"]
128
  }
129
 
130
  # Only include hints for tables that exist in the schema
@@ -150,6 +157,9 @@ def extract_relevant_tables(question, max_tables=4):
150
 
151
  # 2️⃣ Column relevance
152
  for col, desc in meta["columns"].items():
 
 
 
153
  col_l = col.lower()
154
  if col_l in q:
155
  score += 3
@@ -158,7 +168,7 @@ def extract_relevant_tables(question, max_tables=4):
158
 
159
  # 3️⃣ Description relevance (less weight to avoid false positives)
160
  if meta.get("description"):
161
- desc_tokens = set(meta["description"].lower().split())
162
  # Only count meaningful word matches, not common words
163
  common_words = {"the", "is", "at", "which", "on", "for", "a", "an"}
164
  meaningful_matches = tokens & desc_tokens - common_words
@@ -204,7 +214,8 @@ def describe_schema(max_tables=10):
204
  response += f"• **{table.capitalize()}** — {meta['description']}\n"
205
  # Show only first 5 columns per table
206
  for col, desc in list(meta["columns"].items())[:5]:
207
- response += f" - {col}: {desc}\n"
 
208
  if len(meta["columns"]) > 5:
209
  response += f" ... and {len(meta['columns']) - 5} more columns\n"
210
  response += "\n"
@@ -306,7 +317,8 @@ def is_question_supported(question):
306
 
307
  # Description match
308
  if meta.get("description"):
309
- desc_tokens = set(meta["description"].lower().split())
 
310
  score += len(tokens & desc_tokens)
311
 
312
  # ✅ If any table is relevant enough → supported
@@ -325,43 +337,80 @@ def build_prompt(question):
325
  matched = extract_relevant_tables(question)
326
  full_schema = load_ai_schema()
327
 
328
- if matched:
329
- schema = {t: full_schema[t] for t in matched}
330
- else:
331
- # 🚫 Don't send all 100+ tables! Return a helpful error with available tables
332
- available_tables = list(full_schema.keys())[:10] # Show first 10 tables
333
  tables_list = "\n".join(f"- {t}" for t in available_tables)
334
  if len(full_schema) > 10:
335
  tables_list += f"\n... and {len(full_schema) - 10} more tables"
 
336
  raise ValueError(
337
- f"I couldn't find any relevant tables for your question.\n\n"
338
  f"Available tables:\n{tables_list}\n\n"
339
- f"Please try mentioning a specific table name or use 'what data' to see all available tables."
340
  )
341
 
 
 
 
 
 
 
 
 
342
  prompt = """
343
- You are a hospital SQL assistant.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
344
 
345
- Rules:
346
- - Use only SELECT
347
- - SQLite syntax
348
- - Use ONLY the exact table names listed below (do not create or infer table names)
349
- - Use only listed tables/columns
350
- - Return ONLY SQL or NOT_ANSWERABLE
351
 
352
- IMPORTANT: Use EXACTLY the table names provided in the list below. Do not pluralize, modify, or guess table names.
353
  """
354
 
355
  for table, meta in schema.items():
356
  prompt += f"\nTable: {table}\n"
 
357
  for col, desc in meta["columns"].items():
358
- prompt += f"- {col}: {desc}\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
359
 
360
  prompt += f"\nQuestion: {question}\n"
361
- prompt += "\nRemember: Use EXACT table names from the list above. Do not pluralize or modify table names."
 
 
 
 
 
362
  return prompt
363
 
364
 
 
365
  def call_llm(prompt):
366
  """Call OpenAI API with error handling."""
367
  try:
@@ -393,31 +442,51 @@ def sanitize_sql(sql):
393
  return sql.replace("\n", " ").strip()
394
 
395
  def correct_table_names(sql):
396
- """Fix common table name mistakes in generated SQL."""
397
  schema = load_ai_schema()
398
- valid_tables = set(schema.keys())
399
-
400
- sql_lower = sql.lower()
401
- sql_corrected = sql
402
-
403
- # Common table name mappings (case-insensitive replacement)
404
  table_corrections = {
405
- "visits": "encounters",
406
- "visit": "encounters",
407
- "providers": "encounters", # if this table doesn't exist
 
408
  }
409
-
410
- # Check each correction
411
- for wrong_name, correct_name in table_corrections.items():
412
- # Only correct if the wrong table doesn't exist AND correct one does
413
- if wrong_name.lower() not in valid_tables and correct_name.lower() in valid_tables:
414
- # Use word boundaries to avoid partial replacements
415
- pattern = r'\b' + re.escape(wrong_name) + r'\b'
416
- sql_corrected = re.sub(pattern, correct_name, sql_corrected, flags=re.IGNORECASE)
417
-
418
- return sql_corrected
 
 
 
 
 
 
 
 
 
 
 
 
 
 
419
 
420
  def validate_sql(sql):
 
 
 
 
 
 
 
 
 
 
421
  if not sql.lower().startswith("select"):
422
  raise ValueError("Only SELECT allowed")
423
  return sql
@@ -441,7 +510,12 @@ def run_query(sql):
441
 
442
  def is_aggregate_only_query(sql):
443
  s = sql.lower()
444
- return ("count(" in s or "sum(" in s or "avg(" in s) and "group by" not in s
 
 
 
 
 
445
 
446
  def has_underlying_data(sql):
447
  """Check if underlying data exists for the SQL query."""
 
13
  if not api_key:
14
  raise ValueError("OPENAI_API_KEY environment variable is not set")
15
  client = OpenAI(api_key=api_key)
16
+ conn = sqlite3.connect("mimic_iv.db", check_same_thread=False)
17
 
18
 
19
  # =========================
 
54
  # =========================
55
 
56
# Vocabulary used to snap near-miss words in user questions onto the
# MIMIC-IV domain terms the schema matcher understands.
KNOWN_TERMS = [
    "patient", "patients",
    "admission", "admissions",
    "icu", "stay", "icustay",
    "diagnosis", "procedure",
    "medication", "lab",
    "year", "month", "recent", "today"
]


def correct_spelling(q):
    """Normalize a question word-by-word.

    Each token is lowercased, stripped of trailing punctuation, and
    replaced by its closest KNOWN_TERMS entry when the fuzzy match is
    at least 0.8 similar; otherwise the normalized token is kept.
    """
    def _fix(token):
        normalized = token.lower().strip(",.?")
        candidates = get_close_matches(normalized, KNOWN_TERMS, n=1, cutoff=0.8)
        return candidates[0] if candidates else normalized

    return " ".join(_fix(token) for token in q.split())
74
 
75
 
 
79
  # =========================
80
  import json
81
  from functools import lru_cache
82
def col_desc(desc):
    """Return a plain-text description for a column metadata entry.

    Metadata values may be a bare string or a dict carrying a
    "description" key; any other value is coerced with str().
    """
    if not isinstance(desc, dict):
        return str(desc)
    return desc.get("description", "")
87
+
88
 
89
@lru_cache(maxsize=1)
def load_ai_schema():
    """Load the table schema from metadata.json, cached after first read.

    Returns:
        dict: table name -> {"description": ..., "columns": {...}}.

    Raises:
        FileNotFoundError: metadata.json does not exist.
        ValueError: the file is not valid JSON, cannot be read, or does
            not contain a top-level dictionary.
    """
    try:
        with open("metadata.json", "r") as f:
            schema = json.load(f)
    except FileNotFoundError:
        raise FileNotFoundError("metadata.json file not found. Please create it with your table metadata.") from None
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON in metadata.json: {str(e)}") from e
    except Exception as e:
        raise ValueError(f"Error loading metadata: {str(e)}") from e
    # Validate OUTSIDE the try block: previously this ValueError was
    # caught by `except Exception` and re-wrapped with the generic
    # "Error loading metadata" message, hiding the specific cause.
    if not isinstance(schema, dict):
        raise ValueError("Invalid metadata format: expected a dictionary")
    return schema
104
 
 
122
 
123
  # Build hints only for tables that actually exist
124
  hint_mappings = {
125
+ "patient": ["patients"],
126
+ "admission": ["admissions"],
127
+ "visit": ["admissions", "icustays"],
128
+ "icu": ["icustays", "chartevents"],
129
+ "diagnosis": ["diagnoses_icd"],
130
+ "procedure": ["procedures_icd"],
131
+ "medication": ["prescriptions", "emar", "pharmacy"],
132
+ "lab": ["labevents"],
133
+ "vital": ["chartevents"],
134
+ "stay": ["icustays"]
 
135
  }
136
 
137
  # Only include hints for tables that exist in the schema
 
157
 
158
  # 2️⃣ Column relevance
159
  for col, desc in meta["columns"].items():
160
+ desc_text = col_desc(desc)
161
+ desc_tokens = set(desc_text.lower().split())
162
+
163
  col_l = col.lower()
164
  if col_l in q:
165
  score += 3
 
168
 
169
  # 3️⃣ Description relevance (less weight to avoid false positives)
170
  if meta.get("description"):
171
+ desc_tokens = set(col_desc(meta.get("description", "")).lower().split())
172
  # Only count meaningful word matches, not common words
173
  common_words = {"the", "is", "at", "which", "on", "for", "a", "an"}
174
  meaningful_matches = tokens & desc_tokens - common_words
 
214
  response += f"• **{table.capitalize()}** — {meta['description']}\n"
215
  # Show only first 5 columns per table
216
  for col, desc in list(meta["columns"].items())[:5]:
217
+ response += f" - {col}: {col_desc(desc)}\n"
218
+
219
  if len(meta["columns"]) > 5:
220
  response += f" ... and {len(meta['columns']) - 5} more columns\n"
221
  response += "\n"
 
317
 
318
  # Description match
319
  if meta.get("description"):
320
+ desc_tokens = set(col_desc(meta["description"]).lower().split())
321
+
322
  score += len(tokens & desc_tokens)
323
 
324
  # ✅ If any table is relevant enough → supported
 
337
  matched = extract_relevant_tables(question)
338
  full_schema = load_ai_schema()
339
 
340
+ if not matched:
341
+ available_tables = list(full_schema.keys())[:10]
 
 
 
342
  tables_list = "\n".join(f"- {t}" for t in available_tables)
343
  if len(full_schema) > 10:
344
  tables_list += f"\n... and {len(full_schema) - 10} more tables"
345
+
346
  raise ValueError(
347
+ "I couldn't find any relevant tables for your question.\n\n"
348
  f"Available tables:\n{tables_list}\n\n"
349
+ "Try mentioning a table name or ask: 'what data is available?'"
350
  )
351
 
352
+ schema = {t: full_schema[t] for t in matched}
353
+
354
+ IMPORTANT_COLS = {
355
+ "subject_id", "hadm_id", "stay_id",
356
+ "icustay_id", "itemid",
357
+ "charttime", "starttime", "endtime"
358
+ }
359
+
360
  prompt = """
361
+ You are an expert SQLite query generator.
362
+
363
+ STRICT RULES:
364
+ - Use ONLY the tables and columns listed below
365
+ - NEVER invent table or column names
366
+ - If the answer cannot be derived, return: NOT_ANSWERABLE
367
+ - Do NOT explain the SQL
368
+ - Do NOT wrap SQL in markdown
369
+ - Use explicit JOIN conditions
370
+ - Prefer COUNT(*) for totals
371
+
372
+ Always use these joins:
373
+ - patients.subject_id = admissions.subject_id
374
+ - admissions.hadm_id = icustays.hadm_id
375
+ - icustays.stay_id = chartevents.stay_id
376
 
 
 
 
 
 
 
377
 
378
+ Schema:
379
  """
380
 
381
  for table, meta in schema.items():
382
  prompt += f"\nTable: {table}\n"
383
+
384
  for col, desc in meta["columns"].items():
385
+ text = f"{col} {col_desc(desc)}".lower()
386
+
387
+ # Keep columns relevant to question
388
+ if any(w in text for w in question.lower().split()):
389
+ prompt += f"- {col}\n"
390
+
391
+ # Always keep join / key columns
392
+ elif col in IMPORTANT_COLS or col.endswith("_id"):
393
+ prompt += f"- {col}\n"
394
+
395
+ # Optional: help LLM with joins (very helpful for MIMIC)
396
+ prompt += """
397
+ Join hints:
398
+ - patients.subject_id ↔ admissions.subject_id
399
+ - admissions.hadm_id ↔ icustays.hadm_id
400
+ - icustays.stay_id ↔ chartevents.stay_id
401
+ """
402
 
403
  prompt += f"\nQuestion: {question}\n"
404
+ prompt += "\nUse EXACT table and column names as shown above."
405
+
406
+ # Safety cap
407
+ if len(prompt) > 6000:
408
+ prompt = prompt[:6000] + "\n\n# Schema truncated for safety\n"
409
+
410
  return prompt
411
 
412
 
413
+
414
  def call_llm(prompt):
415
  """Call OpenAI API with error handling."""
416
  try:
 
442
  return sql.replace("\n", " ").strip()
443
 
444
def correct_table_names(sql):
    """Rewrite hallucinated table names after FROM/JOIN to real schema tables.

    A name is replaced only when it is NOT already a valid schema table,
    has a known correction, and that correction exists in the schema;
    everything else is left untouched.
    """
    known = {name.lower() for name in load_ai_schema()}

    # Frequent LLM mistakes -> canonical MIMIC-IV table names.
    table_corrections = {
        "visit": "admissions",
        "visits": "admissions",
        "provider": "caregiver",
        "providers": "caregiver"
    }

    def _substitute(m):
        keyword, table = m.group(1), m.group(2)
        lowered = table.lower()
        if lowered not in known and lowered in table_corrections:
            replacement = table_corrections[lowered]
            if replacement in known:
                return f"{keyword} {replacement}"
        return m.group(0)

    # Only touch identifiers in table position (right after FROM/JOIN).
    return re.sub(
        r"\b(from|join)\s+([a-zA-Z_][a-zA-Z0-9_]*)",
        _substitute,
        sql,
        flags=re.IGNORECASE,
    )
476
+
477
+
478
 
479
def validate_sql(sql):
    """Validate that *sql* is a single, safe SELECT statement.

    Returns the SQL unchanged on success.

    Raises:
        ValueError: for a JOIN without an ON condition, multiple
            statements, write/DDL keywords, or a non-SELECT query.
    """
    lowered = sql.lower()

    # The prompt rules only emit JOIN ... ON, so a JOIN with no ON is
    # almost certainly malformed output.
    if " join " in lowered and " on " not in lowered:
        raise ValueError("JOIN without ON condition is not allowed")

    # A single trailing semicolon is tolerated; any other ';' means
    # multiple statements.
    if ";" in sql.strip()[:-1]:
        raise ValueError("Multiple SQL statements are not allowed")

    # Word-boundary match so identifiers such as "last_update" or
    # "dropped_flag" are not false positives (a bare substring scan
    # rejected them before).
    if re.search(r"\b(insert|update|delete|drop|alter)\b", lowered):
        raise ValueError("Unsafe SQL detected")

    if not lowered.startswith("select"):
        raise ValueError("Only SELECT allowed")
    return sql
 
510
 
511
def is_aggregate_only_query(sql):
    """Return True when *sql* computes only bare aggregates.

    True means the query uses COUNT/SUM/AVG with no GROUP BY and no
    window (OVER) clause, i.e. it yields a single summary row.
    """
    s = sql.lower()
    has_aggregate = any(fn in s for fn in ("count(", "sum(", "avg("))
    # \s* tolerates "OVER (" written with whitespace, which the plain
    # substring test "over(" used to miss, misclassifying window
    # queries as bare aggregates.
    is_windowed = re.search(r"\bover\s*\(", s) is not None
    return has_aggregate and "group by" not in s and not is_windowed
518
+
519
 
520
  def has_underlying_data(sql):
521
  """Check if underlying data exists for the SQL query."""
metadata.json ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "patients": {
3
+ "description": "Patient demographic information",
4
+ "columns": {
5
+ "subject_id": "Unique patient identifier",
6
+ "gender": "Biological sex",
7
+ "anchor_age": "Approximate age",
8
+ "anchor_year": "Anchor year for age"
9
+ }
10
+ },
11
+ "admissions": {
12
+ "description": "Hospital admissions for patients",
13
+ "columns": {
14
+ "hadm_id": "Hospital admission ID",
15
+ "subject_id": "Patient ID",
16
+ "admittime": "Admission timestamp",
17
+ "dischtime": "Discharge timestamp",
18
+ "admission_type": "Emergency, elective, etc",
19
+ "admission_location": "Source of admission"
20
+ }
21
+ },
22
+ "icustays": {
23
+ "description": "ICU stay records",
24
+ "columns": {
25
+ "stay_id": "ICU stay identifier",
26
+ "hadm_id": "Hospital admission ID",
27
+ "subject_id": "Patient ID",
28
+ "intime": "ICU admission time",
29
+ "outtime": "ICU discharge time"
30
+ }
31
+ },
32
+ "chartevents": {
33
+ "description": "Time-series ICU measurements (vitals, labs)",
34
+ "columns": {
35
+ "stay_id": "ICU stay ID",
36
+ "itemid": "Measurement type",
37
+ "charttime": "Time of observation",
38
+ "valuenum": "Numeric value"
39
+ }
40
+ },
41
+ "diagnoses_icd": {
42
+ "description": "ICD diagnoses for admissions",
43
+ "columns": {
44
+ "hadm_id": "Hospital admission ID",
45
+ "icd_code": "Diagnosis code",
46
+ "icd_version": "ICD version"
47
+ }
48
+ }
49
+ }
mimic_iv.db ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f199f1b69f2ec1b722011e5055797c0e11f139f1dc899e9076f9ecef6d7c1ce6
3
+ size 128155648