Spaces:

bhavika24
/

Text_to_sql

Sleeping

App Files Files Community

bhavika24 committed on Jan 23

Commit

83b6e91

verified ·

1 Parent(s): f3ebab8

Upload engine.py

Browse files

Files changed (1) hide show

engine.py +168 -442

engine.py CHANGED Viewed

@@ -1,518 +1,244 @@
 import os
-import re
 from openai import OpenAI
-from difflib import get_close_matches
 from datetime import datetime
-TRANSCRIPT = [] #memory log
-#store interaction in transcript
-def log_interaction(user_q, sql=None, result=None, error=None):
-    TRANSCRIPT.append({
-        "timestamp": datetime.utcnow().isoformat(),
-        "question": user_q,
-        "sql": sql,
-        "result_preview": result[:10] if isinstance(result, list) else result,
-        "error": error
-    })
 # =========================
-# SETUP
 # =========================
-# Validate API key
-api_key = os.getenv("OPENAI_API_KEY")
-if not api_key:
-    raise ValueError("OPENAI_API_KEY environment variable is not set")
-client = OpenAI(api_key=api_key)
 # =========================
-# CONVERSATION STATE
 # =========================
-LAST_PROMPT_TYPE = None
-LAST_SUGGESTED_DATE = None
-# =========================
-# HUMAN RESPONSE HELPERS
-# =========================
-def humanize(text):
-    return f"Sure \n\n{text}"
-def friendly(text):
-        return text
-def is_confirmation(text):
-    return text.strip().lower() in ["yes", "yep", "yeah", "ok", "okay", "sure"]
-def is_why_question(text):
-    return text.strip().lower().startswith("why")
-# =========================
-# SPELL CORRECTION
-# =========================
-KNOWN_TERMS = [
-    "patient", "patients",
-    "admission", "admissions",
-    "icu", "stay", "icustay",
-    "diagnosis", "procedure",
-    "medication", "lab",
-    "year", "month", "recent", "today"
-]
-def correct_spelling(q):
-    words = q.split()
-    fixed = []
-    for w in words:
-        clean = w.lower().strip(",.?")
-        match = get_close_matches(clean, KNOWN_TERMS, n=1, cutoff=0.8)
-        fixed.append(match[0] if match else clean)
-    return " ".join(fixed)
-# =========================
-# SCHEMA
-# =========================
-import json
-from functools import lru_cache
-def col_desc(desc):#extract description
-    """Safely extract column description from metadata."""
-    if isinstance(desc, dict):
-        return desc.get("description", "")
-    return str(desc)
-@lru_cache(maxsize=1)
-def load_ai_schema():
-    #load metadata
-    """Load schema from metadata JSON file with error handling."""
-    try:
-        with open("metadata.json", "r") as f:
-            schema = json.load(f)
-            if not isinstance(schema, dict):
-                raise ValueError("Invalid metadata format: expected a dictionary")
-            return schema
-    except FileNotFoundError:
-        raise FileNotFoundError("metadata.json file not found. Please create it with your table metadata.")
-    except json.JSONDecodeError as e:
-        raise ValueError(f"Invalid JSON in metadata.json: {str(e)}")
-    except Exception as e:
-        raise ValueError(f"Error loading metadata: {str(e)}")
 # =========================
-# TABLE MATCHING (CORE LOGIC)
 # =========================
-def extract_relevant_tables(question, max_tables=4):
-    schema = load_ai_schema()
-    q = question.lower()
-    tokens = set(q.replace("?", "").replace(",", "").split())
-    matched = []
-    # Lightweight intent hints - dynamically filter to only include tables that exist
-    # Map natural language terms to potential table names (check against schema)
-    all_tables = list(schema.keys())
-    table_names_lower = [t.lower() for t in all_tables]
-    DOMAIN_HINTS = {}
-    # Build hints only for tables that actually exist
-    hint_mappings = {
-    # Patients & visits
-    "patient": ["patients"],
-    "patients": ["patients"],
-    "admission": ["admissions"],
-    "admissions": ["admissions"],
-    "visit": ["admissions", "icustays"],
-    "visits": ["admissions", "icustays"],
-    # ICU
-    "icu": ["icustays", "chartevents"],
-    "stay": ["icustays"],
-    "stays": ["icustays"],
-    # Diagnoses / conditions
-    "diagnosis": ["diagnoses_icd"],
-    "diagnoses": ["diagnoses_icd"],
-    "condition": ["diagnoses_icd"],
-    "conditions": ["diagnoses_icd"],
-    # Procedures
-    "procedure": ["procedures_icd"],
-    "procedures": ["procedures_icd"],
-    # Medications
-    "medication": ["prescriptions", "emar", "pharmacy"],
-    "medications": ["prescriptions", "emar", "pharmacy"],
-    "drug": ["prescriptions"],
-    "drugs": ["prescriptions"],
-    # Labs & vitals
-    "lab": ["labevents"],
-    "labs": ["labevents"],
-    "vital": ["chartevents"],
-    "vitals": ["chartevents"],
-}
-    # Only include hints for tables that exist in the schema
-    for intent, possible_tables in hint_mappings.items():
-        matching_tables = [t for t in possible_tables if t in table_names_lower]
-        if matching_tables:
-            DOMAIN_HINTS[intent] = matching_tables
-    # Early exit threshold - if we find a perfect match, we can stop early
-    VERY_HIGH_SCORE = 10
-    for table, meta in schema.items():
-        score = 0
-        table_l = table.lower()
-        # 1️⃣ Strong signal: table name (exact match is very high confidence)
-        if table_l in q:
-            score += 6
-            # Early exit optimization: if exact table match found, prioritize it
-            if score >= VERY_HIGH_SCORE:
-                matched.append((table, score))
-                continue
-        # 2️⃣ Column relevance
-        for col, desc in meta["columns"].items():
-            desc_text = col_desc(desc)
-            desc_tokens = set(desc_text.lower().split())
-            col_l = col.lower()
-            if col_l in q:
-                score += 3
-            elif any(tok in col_l for tok in tokens):
-                score += 1
-        # 3️⃣ Description relevance (less weight to avoid false positives)
-        if meta.get("description"):
-            desc_tokens = set(col_desc(meta.get("description", "")).lower().split())
-            # Only count meaningful word matches, not common words
-            common_words = {"the", "is", "at", "which", "on", "for", "a", "an"}
-            meaningful_matches = tokens & desc_tokens - common_words
-            if meaningful_matches:
-                score += len(meaningful_matches) * 0.5  # Reduced weight
-        # 4️⃣ Semantic intent mapping (important - highest priority)
-        for intent, tables in DOMAIN_HINTS.items():
-            if intent in q and table_l in tables:
-                score += 5
-        # 5️⃣ Only add if meets minimum threshold (prevents low-quality matches)
-        # Use lower threshold for small schemas (more lenient)
-        # Increased threshold from 3 to 4 for better precision, but lower to 2 for small schemas
-        threshold = 2 if len(schema) <= 5 else 4
-        if score >= threshold:
-            matched.append((table, score))
-    # Sort by relevance
-    matched.sort(key=lambda x: x[1], reverse=True)
-    # If no matches but schema is very small, return all tables (with lower confidence)
-    if not matched and len(schema) <= 3:
-        return list(schema.keys())[:max_tables]
-    return [t[0] for t in matched[:max_tables]]
-# =========================
-# HUMAN SCHEMA DESCRIPTION
-# =========================
-def describe_schema(max_tables=10):#what data you have or which table exist
-    schema = load_ai_schema()
-    total_tables = len(schema)
-    response = f"Here's the data I currently have access to ({total_tables} tables):\n\n"
-    # Show only top N tables to avoid overwhelming output
-    shown_tables = list(schema.items())[:max_tables]
-    for table, meta in shown_tables:
-        response += f"• **{table.capitalize()}** — {meta['description']}\n"
-        # Show only first 5 columns per table
-        for col, desc in list(meta["columns"].items())[:5]:
-            response += f"  - {col}: {col_desc(desc)}\n"
-        if len(meta["columns"]) > 5:
-            response += f"  ... and {len(meta['columns']) - 5} more columns\n"
-        response += "\n"
-    if total_tables > max_tables:
-        response += f"\n... and {total_tables - max_tables} more tables.\n"
-        response += "Ask about a specific table to see its details.\n\n"
-    response += (
-        "You can ask things like:\n"
-        "• How many patients are there?\n"
-        "• Patient count by gender\n"
-        "• Admissions by year\n\n"
-        "Just tell me what you want to explore "
     )
-    return response
-# =========================
-# TIME HANDLING
-# =========================
-# =========================
-# SQL GENERATION
-# =========================
-def build_prompt(question):
-    matched = extract_relevant_tables(question)
-    full_schema = load_ai_schema()
-    if not matched:
-        available_tables = list(full_schema.keys())[:10]
-        tables_list = "\n".join(f"- {t}" for t in available_tables)
-        if len(full_schema) > 10:
-            tables_list += f"\n... and {len(full_schema) - 10} more tables"
         raise ValueError(
-            "I couldn't find any relevant tables for your question.\n\n"
-            f"Available tables:\n{tables_list}\n\n"
-            "Try mentioning a table name or ask: 'what data is available?'"
         )
-    schema = {t: full_schema[t] for t in matched}
-    IMPORTANT_COLS = {
-        "subject_id", "hadm_id", "stay_id",
-        "icustay_id", "itemid",
-        "charttime", "starttime", "endtime"
-    }
-    prompt = """
-You are an expert SQLite query generator.
-STRICT RULES:
-- Use ONLY the tables and columns listed below
-- NEVER invent table or column names
-- If the answer cannot be derived, return: NOT_ANSWERABLE
-- Do NOT explain the SQL
-- Do NOT wrap SQL in markdown
-- Use explicit JOIN conditions
-- Prefer COUNT(*) for totals
-- Use these joins only if columns from both tables are required.
-- patients.subject_id = admissions.subject_id
-- admissions.hadm_id = icustays.hadm_id
-- icustays.stay_id = chartevents.stay_id
-Schema:
-"""
-    for table, meta in schema.items():
-        prompt += f"\nTable: {table}\n"
-        for col, desc in meta["columns"].items():
-            text = f"{col} {col_desc(desc)}".lower()
-            # Keep columns relevant to question
-            if any(w in text for w in question.lower().split()):
-                prompt += f"- {col}\n"
-            # Always keep join / key columns
-            elif col in IMPORTANT_COLS or col.endswith("_id"):
-                prompt += f"- {col}\n"
-    # Optional: help LLM with joins (very helpful for MIMIC)
-    prompt += """
-Join hints:
-- patients.subject_id ↔ admissions.subject_id
-- admissions.hadm_id ↔ icustays.hadm_id
-- icustays.stay_id ↔ chartevents.stay_id
 """
-    prompt += f"\nQuestion: {question}\n"
-    prompt += "\nUse EXACT table and column names as shown above."
-    # Safety cap
-    if len(prompt) > 6000:
-        prompt = prompt[:6000] + "\n\n# Schema truncated for safety\n"
-    return prompt
-def call_llm(prompt):
-    """Call OpenAI API with error handling."""
-    try:
-        res = client.chat.completions.create(
-            model="gpt-4.1-mini",
-            messages=[
-                {"role": "system", "content": "Return only SQL or NOT_ANSWERABLE"},
-                {"role": "user", "content": prompt}
-            ],
-            temperature=0
-        )
-        if not res.choices or not res.choices[0].message.content:
-            raise ValueError("Empty response from OpenAI API")
-        return res.choices[0].message.content.strip()
-    except Exception as e:
-        raise ValueError(f"OpenAI API error: {str(e)}")
 # =========================
-# SQL SAFETY
 # =========================
-def sanitize_sql(sql):
-    # Remove code fence markers but preserve legitimate SQL
-    sql = sql.replace("```sql", "").replace("```", "").strip()
-    # Remove leading/trailing markdown code markers
-    if sql.startswith("sql"):
-        sql = sql[3:].strip()
-    sql = sql.split(";")[0]
-    return sql.replace("\n", " ").strip()
-def correct_table_names(sql):
-    schema = load_ai_schema()
-    valid_tables = {t.lower() for t in schema.keys()}
-    table_corrections = {
-        "visit": "admissions",
-        "visits": "admissions",
-        "provider": "caregiver",
-        "providers": "caregiver"
-    }
-    def replace_table(match):
-        keyword = match.group(1)
-        table = match.group(2)
-        table_l = table.lower()
-        if table_l in valid_tables:
-            return match.group(0)
-        if table_l in table_corrections:
-            corrected = table_corrections[table_l]
-            if corrected in valid_tables:
-                return f"{keyword} {corrected}"
-        return match.group(0)
-    pattern = re.compile(
-        r"\b(from|join)\s+([a-zA-Z_][a-zA-Z0-9_]*)",
-        re.IGNORECASE
-    )
-    return pattern.sub(replace_table, sql)
-def validate_sql(sql):
-    sql_l = sql.lower().strip()
-    # Must be SELECT
-    if not sql_l.startswith("select"):
-        raise ValueError("Only SELECT statements are allowed")
-    # Block dangerous keywords
-    forbidden = ["insert", "update", "delete", "drop", "alter", "truncate"]
-    if any(word in sql_l for word in forbidden):
-        raise ValueError("Unsafe SQL detected")
-    # Block multiple statements
-    if ";" in sql_l[:-1]:
-        raise ValueError("Multiple SQL statements are not allowed")
-    # JOIN must have ON
-    if " join " in sql_l and " on " not in sql_l:
-        raise ValueError("JOIN without ON condition is not allowed")
-    # Prevent SELECT *
-    if "select *" in sql_l:
-        raise ValueError("SELECT * is not allowed")
-    # Enforce LIMIT
-    if "limit" not in sql_l:
-        sql += " LIMIT 100"
-    return sql
-def explain_sql(sql):
-    return {
-        "type": "aggregation" if "count(" in sql else "selection",
-        "has_join": "join" in sql.lower(),
-        "has_filter": "where" in sql.lower()
-    }
-# =========================
-# PATIENT SUMMARY
-# =========================
-def validate_identifier(name):
-    """Validate that identifier is safe (only alphanumeric and underscores)."""
-    if not name or not isinstance(name, str):
-        return False
-    # Check for SQL injection attempts
-    forbidden = [";", "--", "/*", "*/", "'", '"', "`", "(", ")", " ", "\n", "\t"]
-    if any(char in name for char in forbidden):
-        return False
-    # Must start with letter or underscore, rest alphanumeric/underscore
-    return bool(re.match(r'^[a-zA-Z_][a-zA-Z0-9_]*$', name))
 # =========================
-# MAIN ENGINE
 # =========================
-def process_question(question):
-    question = correct_spelling(question)
-    # 1️⃣ Metadata requests
-    if any(x in question.lower() for x in ["what data", "what tables"]):
-        return {
-            "status": "ok",
-            "message": describe_schema()
-        }
-    # 2️⃣ Build LLM prompt
-    try:
-        prompt = build_prompt(question)
-    except Exception as e:
-        return {
-            "status": "error",
-            "message": str(e)
-        }
-    # 3️⃣ Generate SQL
-    try:
-        sql = call_llm(prompt)
-    except Exception as e:
-        return {
-            "status": "error",
-            "message": str(e)
-        }
-    if sql == "NOT_ANSWERABLE":
-        return {
-            "status": "ok",
-            "message": "I don't have enough data to answer that."
-        }
-    # 4️⃣ Sanitize & validate
-    try:
-        sql = sanitize_sql(sql)
-        sql = correct_table_names(sql)
-        sql = validate_sql(sql)
-        sql_info = explain_sql(sql)
-    except Exception as e:
-        return {
-            "status": "error",
-            "message": str(e)
-        }
-    # 5️⃣ Return SQL only (no execution)
     return {
-    "status": "ok",
-    "message": humanize(
-        "Here’s the SQL query I generated based on your question 😊"
-    ),
-    "sql": sql,
-    "sql_info": sql_info
-}

+import json
 import os
+from functools import lru_cache
 from openai import OpenAI
 from datetime import datetime
 # =========================
+# CONFIG
 # =========================
+# Module-wide OpenAI client, created at import time.
+# os.getenv returns None when OPENAI_API_KEY is unset.
+# NOTE(review): the removed version raised an explicit ValueError for a
+# missing key before building the client; here the value is passed through
+# unchecked -- confirm the library's own failure mode is acceptable.
+client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 # =========================
+# METADATA LOADING
 # =========================
+@lru_cache(maxsize=1)
+def load_metadata():
+    """Load the four JSON metadata files that drive SQL generation.
+
+    Reads ``modules.json``, ``join_graph.json``, ``field_types.json`` and
+    ``fields.json`` relative to the current working directory. A successful
+    load is memoised by ``lru_cache``, so later calls return the same dict
+    object without touching the disk; callers should treat it as read-only.
+
+    Returns:
+        dict with the keys ``"modules"``, ``"joins"``, ``"field_types"`` and
+        ``"fields"``, each holding the parsed content of the matching file.
+
+    Raises:
+        FileNotFoundError: if one of the files does not exist.
+        json.JSONDecodeError: if one of the files is not valid JSON.
+    """
+    # key in the returned dict -> file it is loaded from
+    sources = {
+        "modules": "modules.json",
+        "joins": "join_graph.json",
+        "field_types": "field_types.json",
+        "fields": "fields.json",
+    }
+    metadata = {}
+    for key, filename in sources.items():
+        # JSON text is UTF-8; do not depend on the platform's locale default.
+        with open(filename, encoding="utf-8") as f:
+            metadata[key] = json.load(f)
+    return metadata
+def resolve_operator(op, value):
+    """Translate a logical filter operator and its value into SQL text.
+
+    Args:
+        op: one of ``equals``, ``not_equals``, ``greater_than``,
+            ``less_than``, ``greater_or_equal``, ``less_or_equal``,
+            ``contains``, ``starts_with``, ``ends_with``, ``in``, ``not_in``.
+        value: the comparison value; must be a list for ``in`` / ``not_in``.
+
+    Returns:
+        A ``(sql_operator, sql_value)`` tuple, where ``sql_value`` is already
+        rendered as SQL literal text ready to be placed after the operator.
+
+    Raises:
+        ValueError: for an unknown operator, or a non-list value passed to
+            ``in`` / ``not_in``.
+    """
+    mapping = {
+        "equals": "=",
+        "not_equals": "!=",
+        "greater_than": ">",
+        "less_than": "<",
+        "greater_or_equal": ">=",
+        "less_or_equal": "<=",
+        "contains": "LIKE",
+        "starts_with": "LIKE",
+        "ends_with": "LIKE",
+        "in": "IN",
+        "not_in": "NOT IN"
+    }
+    if op not in mapping:
+        raise ValueError(f"Unsupported operator: {op}")
+    sql_op = mapping[op]
+
+    def quote(text):
+        # The value originates from the user's question (via the LLM plan).
+        # Doubling embedded single quotes keeps it from closing the literal.
+        return "'" + str(text).replace("'", "''") + "'"
+
+    def list_item(item):
+        # Numbers stay unquoted; everything else becomes a quoted string.
+        if isinstance(item, (int, float)):
+            return repr(item)
+        return quote(item)
+
+    # NOTE: '%' and '_' inside the value still act as LIKE wildcards.
+    if op == "contains":
+        return sql_op, quote(f"%{value}%")
+    if op == "starts_with":
+        return sql_op, quote(f"{value}%")
+    if op == "ends_with":
+        return sql_op, quote(f"%{value}")
+    if op in ("in", "not_in"):
+        if not isinstance(value, list):
+            raise ValueError("IN operator requires list")
+        return sql_op, f"({','.join(map(list_item, value))})"
+    return sql_op, quote(value)
 # =========================
+# JOIN RESOLUTION
 # =========================
+def resolve_join_path(start_table, end_table):
+    """Return the join steps registered for a (start, end) table pair.
+
+    Scans the entries of the ``"joins"`` metadata and returns the ``"steps"``
+    value of the first entry whose ``start_table`` and ``end_table`` both
+    match the arguments.
+
+    Raises:
+        ValueError: if no entry matches the pair.
+    """
+    graph = load_metadata()["joins"]
+    for entry in graph.values():
+        if entry["start_table"] != start_table:
+            continue
+        if entry["end_table"] != end_table:
+            continue
+        return entry["steps"]
+    message = f"No join path found from {start_table} to {end_table}"
+    raise ValueError(message)
+def resolve_field(field_name, module):
+    """Look up a field's metadata entry and check that it fits the module.
+
+    Returns:
+        The entry stored under ``field_name`` in the ``"fields"`` metadata.
+
+    Raises:
+        ValueError: if the field is unknown, belongs to a different module,
+            or has no ``table`` / ``column`` mapping.
+    """
+    catalog = load_metadata()["fields"]
+    if field_name not in catalog:
+        raise ValueError(f"Unknown field: {field_name}")
+    entry = catalog[field_name]
+    if entry["module"] != module:
+        raise ValueError(
+            f"Field '{field_name}' does not belong to module '{module}'"
+        )
+    has_mapping = "table" in entry and "column" in entry
+    if not has_mapping:
+        raise ValueError(
+            f"Field '{field_name}' is missing table/column mapping"
+        )
+    return entry
+def build_join_sql(base_table, steps):
+    """Render a chain of join steps as newline-separated JOIN clauses.
+
+    The first step is joined against ``base_table`` (referenced by its own
+    name); every following step is joined against the alias of the step
+    before it. Each step supplies ``alias``, ``join_type``, ``table``,
+    ``base_column`` and ``foreign_column``.
+
+    Returns:
+        One JOIN clause per step joined with ``"\\n"``; an empty string when
+        ``steps`` is empty.
+    """
+    clauses = []
+    left = base_table  # left-hand side of the next ON condition
+    for step in steps:
+        right = step["alias"]
+        kind = step["join_type"].upper()
+        target = f"{kind} JOIN {step['table']} {right} "
+        condition = (
+            f"ON {left}.{step['base_column']} = {right}.{step['foreign_column']}"
+        )
+        clauses.append(target + condition)
+        left = right
+    return "\n".join(clauses)
+# =========================
+# INTENT PARSING (LLM)
+# =========================
+def parse_intent(question):
+    """Ask the LLM to turn a natural-language question into a query plan.
+
+    Args:
+        question: the user's question; it is embedded verbatim in the prompt.
+
+    Returns:
+        The plan as a dict decoded from the model's JSON reply.
+
+    Raises:
+        ValueError: if the reply is empty or is not a JSON object.
+        json.JSONDecodeError: if the reply is not valid JSON (a ValueError
+            subclass).
+    """
+    prompt = f"""
+You are a query planner.
+Extract:
+- module
+- filters (field, operator, value)
+- selected fields (list of fields)
+Return JSON only.
+Example:
+{{
+  "module": "employees",
+  "filters": [
+    {{ "field": "department", "operator": "equals", "value": "IT" }}
+  ]
+}}
+User question:
+{question}
 """
+    res = client.chat.completions.create(
+        model="gpt-4.1-mini",
+        messages=[{"role": "user", "content": prompt}],
+        temperature=0
+    )
+    # content may be None; normalise to a string before parsing.
+    text = (res.choices[0].message.content or "").strip()
+    # Models often wrap JSON in a Markdown fence despite "JSON only".
+    if text.startswith("```"):
+        first_newline = text.find("\n")
+        text = text[first_newline + 1:] if first_newline != -1 else ""
+        text = text.rstrip()
+        if text.endswith("```"):
+            text = text[:-3]
+        text = text.strip()
+    if not text:
+        raise ValueError("Empty response from intent model")
+    plan = json.loads(text)
+    if not isinstance(plan, dict):
+        raise ValueError("Intent model did not return a JSON object")
+    return plan
 # =========================
+# SQL GENERATOR
 # =========================
+def build_sql(plan):
+    """Build a SELECT statement from a query plan.
+
+    Args:
+        plan: dict with a ``"module"`` key and an optional ``"filters"``
+            list of ``{"field", "operator", "value"}`` dicts.
+
+    Returns:
+        SQL text selecting ``<base_table>.*`` with the joins needed by the
+        filters, the filters AND-ed together, and ``LIMIT 100``. The WHERE
+        clause is omitted when the plan has no filters.
+
+    Raises:
+        ValueError: for an unknown module, field or operator, or when no
+            join path exists to a filtered table.
+
+    NOTE(review): any "selected fields" in the plan are not used here; the
+    statement always selects every column of the base table.
+    """
+    meta = load_metadata()
+    module = plan["module"]
+    if module not in meta["modules"]:
+        raise ValueError(f"Unknown module: {module}")
+    base_table = meta["modules"][module]["base_table"]
+    joins = []
+    # table name -> the name it must be referenced by inside the query.
+    # Joined tables are aliased by build_join_sql, so the WHERE clause has
+    # to use that alias rather than the raw table name.
+    qualifiers = {base_table: base_table}
+    where_clauses = []
+    # `or []` also covers an explicit "filters": null in the plan.
+    for f in plan.get("filters") or []:
+        field = resolve_field(f["field"], module)
+        table = field["table"]
+        column = field["column"]
+        # Join each table only once, including tables that were already
+        # pulled in as an intermediate step of an earlier path.
+        if table not in qualifiers:
+            join_steps = resolve_join_path(base_table, table)
+            joins.append(build_join_sql(base_table, join_steps))
+            for step in join_steps:
+                qualifiers.setdefault(step["table"], step["alias"])
+            qualifiers.setdefault(table, table)
+        sql_op, sql_value = resolve_operator(f["operator"], f["value"])
+        where_clauses.append(
+            f"{qualifiers[table]}.{column} {sql_op} {sql_value}"
+        )
+    lines = [
+        f"SELECT {base_table}.*",
+        f"FROM {base_table}",
+        " ".join(joins),
+    ]
+    # An empty condition list would leave a dangling, invalid "WHERE".
+    if where_clauses:
+        lines.append(f"WHERE {' AND '.join(where_clauses)}")
+    lines.append("LIMIT 100")
+    return "\n    ".join(lines)
+# =========================
+# VALIDATION
+# =========================
+def validate_sql(sql):
+    """Reject SQL that is not a plain, read-only SELECT statement.
+
+    Args:
+        sql: the statement to check.
+
+    Returns:
+        ``sql`` unchanged. Its case is preserved so that string literals in
+        the statement keep their meaning.
+
+    Raises:
+        ValueError: if the statement does not start with SELECT or contains
+            a data-modifying keyword outside of quoted literals.
+    """
+    import re  # local import: this module does not import re at file level
+
+    # Blank out quoted literals so a filter value such as 'drop-in' is not
+    # mistaken for a keyword; lower-case only this working copy.
+    scrubbed = re.sub(r"'(?:[^']|'')*'", "''", sql).lower()
+    if not scrubbed.lstrip().startswith("select"):
+        raise ValueError("Only SELECT allowed")
+    forbidden = ["drop", "delete", "update", "insert", "truncate"]
+    # Whole-word match, so identifiers like updated_at are not rejected.
+    if any(re.search(rf"\b{word}\b", scrubbed) for word in forbidden):
+        raise ValueError("Unsafe SQL")
+    return sql
 # =========================
+# MAIN ENTRY POINT
 # =========================
+def run(question):
+    """Turn a natural-language question into a validated SQL statement.
+
+    Chains ``parse_intent`` -> ``build_sql`` -> ``validate_sql``.
+
+    Returns:
+        dict with ``"query_plan"`` (the parsed plan) and ``"sql"`` (the
+        value returned by ``validate_sql``).
+    """
+    query_plan = parse_intent(question)
+    checked_sql = validate_sql(build_sql(query_plan))
+    result = {}
+    result["query_plan"] = query_plan
+    result["sql"] = checked_sql
+    return result