Spaces:

bhavika24
/

Text_to_sql

Sleeping

App Files Files Community

bhavika24 committed 13 days ago

Commit

4897d3e

verified ·

1 Parent(s): eb31619

Upload engine.py

Browse files

Files changed (1) hide show

engine.py +84 -47

engine.py CHANGED Viewed

@@ -47,6 +47,12 @@ KNOWN_TERMS = [
     "admitted", "admission",
     "year", "month", "last", "recent", "today"
 ]
 def correct_spelling(q):
     words = q.split()
@@ -93,27 +99,42 @@ def load_ai_schema():
 # TABLE MATCHING (CORE LOGIC)
 # =========================
-def extract_relevant_tables(question, max_tables=5):
     schema = load_ai_schema()
     q = question.lower()
     tokens = set(q.replace("?", "").replace(",", "").split())
     matched = []
     for table, meta in schema.items():
         score = 0
         table_l = table.lower()
-        # 1️⃣ Table name match (strong signal)
         if table_l in q:
-            score += 5
-        # 2️⃣ Description match
-        if meta.get("description"):
-            desc_words = meta["description"].lower().split()
-            score += len(tokens & set(desc_words)) * 2
-        # 3️⃣ Column name matches
         for col, _ in meta["columns"]:
             col_l = col.lower()
             if col_l in q:
@@ -121,46 +142,52 @@ def extract_relevant_tables(question, max_tables=5):
             elif any(tok in col_l for tok in tokens):
                 score += 1
-        # 4️⃣ Weak semantic hints
-        semantic_map = {
-            "patient": ["patient", "patients"],
-            "visit": ["visit", "encounter"],
-            "medication": ["drug", "medicine"],
-            "admission": ["admit", "admission"],
-            "date": ["date", "year", "month"]
-        }
-        for key, words in semantic_map.items():
-            if any(w in q for w in words) and key in table_l:
-                score += 2
-        if score > 0:
             matched.append((table, score))
     # Sort by relevance
     matched.sort(key=lambda x: x[1], reverse=True)
-    # Return top N tables
     return [t[0] for t in matched[:max_tables]]
 # =========================
 # HUMAN SCHEMA DESCRIPTION
 # =========================
-def describe_schema():
     schema = load_ai_schema()
-    response = "Here’s the data I currently have access to:\n\n"
-    for table, meta in schema.items():
         response += f"• **{table.capitalize()}** — {meta['description']}\n"
-        for col, desc in meta["columns"]:
             response += f"  - {col}: {desc}\n"
         response += "\n"
     response += (
         "You can ask things like:\n"
         "• How many patients are there?\n"
@@ -168,9 +195,6 @@ def describe_schema():
         "• Admissions by year\n\n"
         "Just tell me what you want to explore "
     )
     return response
@@ -204,21 +228,21 @@ def is_question_supported(question):
     q = question.lower()
     tokens = set(q.replace("?", "").replace(",", "").split())
-    # 1️⃣ Allow analytical intent even if table not mentioned
     analytic_keywords = {
         "count", "total", "average", "avg", "sum",
-        "how many", "number of", "trend", "trendline",
-        "increase", "decrease", "compare"
     }
     if any(k in q for k in analytic_keywords):
         return True
-    # 2️⃣ Schema-based scoring
     schema = load_ai_schema()
-    score = 0
     for table, meta in schema.items():
         table_l = table.lower()
         # Table name match
@@ -235,11 +259,15 @@ def is_question_supported(question):
         # Description match
         if meta.get("description"):
-            desc_tokens = meta["description"].lower().split()
-            score += len(tokens & set(desc_tokens))
-    # 3️⃣ Threshold — prevents random questions
-    return score >= 2
 # =========================
@@ -252,7 +280,11 @@ def build_prompt(question):
     if matched:
         schema = {t: load_ai_schema()[t] for t in matched}
     else:
-        schema = load_ai_schema()  # fallback if nothing matched
     prompt = """
 You are a hospital SQL assistant.
@@ -323,9 +355,6 @@ def has_underlying_data(sql):
     cur = conn.cursor()
     return cur.execute(test_sql).fetchone() is not None
 # =========================
 # PATIENT SUMMARY
 # =========================
@@ -466,12 +495,20 @@ def process_question(question):
     # ----------------------------------
     # Generate SQL
     # ----------------------------------
-    sql = call_llm(build_prompt(question))
     if sql == "NOT_ANSWERABLE":
         return {
             "status": "ok",
-            "message": "I don’t have enough data to answer that.",
             "data": []
         }

     "admitted", "admission",
     "year", "month", "last", "recent", "today"
 ]
+DOMAIN_ALIASES = {
+    "consultant": ["provider", "encounter"],
+    "doctor": ["provider"],
+    "appointment": ["encounter"],
+    "visit": ["encounter"],
+}
 def correct_spelling(q):
     words = q.split()
 # TABLE MATCHING (CORE LOGIC)
 # =========================
+def extract_relevant_tables(question, max_tables=4):
     schema = load_ai_schema()
     q = question.lower()
     tokens = set(q.replace("?", "").replace(",", "").split())
     matched = []
+    # Lightweight intent hints (NO hard dependency)
+    DOMAIN_HINTS = {
+        "consultant": ["encounters"],
+        "doctor": ["encounters"],
+        "visit": ["encounters"],
+        "appointment": ["encounters"],
+        "patient": ["patients"],
+        "medication": ["medications"],
+        "drug": ["medications"],
+        "condition": ["conditions"],
+        "diagnosis": ["conditions"]
+    }
+    # Shortcut threshold - a table whose score reaches this is added immediately and its remaining checks are skipped
+    VERY_HIGH_SCORE = 10
     for table, meta in schema.items():
         score = 0
         table_l = table.lower()
+        # 1️⃣ Strong signal: table name (exact match is very high confidence)
         if table_l in q:
+            score += 6
+            # Shortcut for a very strong match. NOTE(review): score is exactly 6 here and VERY_HIGH_SCORE is 10, so this branch never runs as written
+            if score >= VERY_HIGH_SCORE:
+                matched.append((table, score))
+                continue
+        # 2️⃣ Column relevance
         for col, _ in meta["columns"]:
             col_l = col.lower()
             if col_l in q:
             elif any(tok in col_l for tok in tokens):
                 score += 1
+        # 3️⃣ Description relevance
+        if meta.get("description"):
+            desc_tokens = set(meta["description"].lower().split())
+            score += len(tokens & desc_tokens)
+        # 4️⃣ Semantic intent mapping (important)
+        for intent, tables in DOMAIN_HINTS.items():
+            if intent in q and table_l in tables:
+                score += 5
+        # 5️⃣ Only add if meets minimum threshold (prevents low-quality matches)
+        if score >= 3:
             matched.append((table, score))
     # Sort by relevance
     matched.sort(key=lambda x: x[1], reverse=True)
     return [t[0] for t in matched[:max_tables]]
 # =========================
 # HUMAN SCHEMA DESCRIPTION
 # =========================
+def describe_schema(max_tables=10):
     schema = load_ai_schema()
+    total_tables = len(schema)
+    response = f"Here's the data I currently have access to ({total_tables} tables):\n\n"
+    # Show only top N tables to avoid overwhelming output
+    shown_tables = list(schema.items())[:max_tables]
+    for table, meta in shown_tables:
         response += f"• **{table.capitalize()}** — {meta['description']}\n"
+        # Show only first 5 columns per table
+        for col, desc in list(meta["columns"])[:5]:
             response += f"  - {col}: {desc}\n"
+        if len(meta["columns"]) > 5:
+            response += f"  ... and {len(meta['columns']) - 5} more columns\n"
         response += "\n"
+    if total_tables > max_tables:
+        response += f"\n... and {total_tables - max_tables} more tables.\n"
+        response += "Ask about a specific table to see its details.\n\n"
     response += (
         "You can ask things like:\n"
         "• How many patients are there?\n"
         "• Admissions by year\n\n"
         "Just tell me what you want to explore "
     )
     return response
     q = question.lower()
     tokens = set(q.replace("?", "").replace(",", "").split())
+    # 1️⃣ Allow analytical intent even without table names
     analytic_keywords = {
         "count", "total", "average", "avg", "sum",
+        "how many", "number of", "trend",
+        "increase", "decrease", "compare", "more than", "less than"
     }
     if any(k in q for k in analytic_keywords):
         return True
+    # 2️⃣ Check schema relevance (table-by-table)
     schema = load_ai_schema()
     for table, meta in schema.items():
+        score = 0
         table_l = table.lower()
         # Table name match
         # Description match
         if meta.get("description"):
+            desc_tokens = set(meta["description"].lower().split())
+            score += len(tokens & desc_tokens)
+        # ✅ If any table is relevant enough → supported
+        if score >= 2:
+            return True
+    return False
 # =========================
     if matched:
         schema = {t: load_ai_schema()[t] for t in matched}
     else:
+        # 🚫 Don't send all 100+ tables! Raise a helpful error instead
+        raise ValueError(
+            "I couldn't find any relevant tables for your question. "
+            "Please try mentioning a specific table name or use 'what data' to see available tables."
+        )
     prompt = """
 You are a hospital SQL assistant.
     cur = conn.cursor()
     return cur.execute(test_sql).fetchone() is not None
 # =========================
 # PATIENT SUMMARY
 # =========================
     # ----------------------------------
     # Generate SQL
     # ----------------------------------
+    try:
+        sql = call_llm(build_prompt(question))
+    except ValueError as e:
+        # Handle case where no relevant tables found
+        return {
+            "status": "ok",
+            "message": str(e),
+            "data": []
+        }
     if sql == "NOT_ANSWERABLE":
         return {
             "status": "ok",
+            "message": "I don't have enough data to answer that.",
             "data": []
         }