Spaces:

bhavika24
/

Text_to_sql

Sleeping

App Files Files Community

bhavika24 committed on Jan 21

Commit

82d3e6d

verified ·

1 Parent(s): 4897d3e

Upload engine.py

Browse files

Files changed (1) hide show

engine.py +93 -38

engine.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import os
 import sqlite3
 from openai import OpenAI
 from difflib import get_close_matches
@@ -111,6 +112,8 @@ def extract_relevant_tables(question, max_tables=4):
         "consultant": ["encounters"],
         "doctor": ["encounters"],
         "visit": ["encounters"],
         "appointment": ["encounters"],
         "patient": ["patients"],
         "medication": ["medications"],
@@ -142,18 +145,23 @@ def extract_relevant_tables(question, max_tables=4):
             elif any(tok in col_l for tok in tokens):
                 score += 1
-        # 3️⃣ Description relevance
         if meta.get("description"):
             desc_tokens = set(meta["description"].lower().split())
-            score += len(tokens & desc_tokens)
-        # 4️⃣ Semantic intent mapping (important)
         for intent, tables in DOMAIN_HINTS.items():
             if intent in q and table_l in tables:
                 score += 5
         # 5️⃣ Only add if meets minimum threshold (prevents low-quality matches)
-        if score >= 3:
             matched.append((table, score))
     # Sort by relevance
@@ -292,8 +300,12 @@ You are a hospital SQL assistant.
 Rules:
 - Use only SELECT
 - SQLite syntax
 - Use only listed tables/columns
 - Return ONLY SQL or NOT_ANSWERABLE
 """
     for table, meta in schema.items():
@@ -302,6 +314,7 @@ Rules:
             prompt += f"- {col}: {desc}\n"
     prompt += f"\nQuestion: {question}\n"
     return prompt
@@ -325,6 +338,31 @@ def sanitize_sql(sql):
     sql = sql.split(";")[0]
     return sql.replace("\n", " ").strip()
 def validate_sql(sql):
     if not sql.lower().startswith("select"):
         raise Exception("Only SELECT allowed")
@@ -362,42 +400,47 @@ def has_underlying_data(sql):
 def build_table_summary(table_name):
     cur = conn.cursor()
-    # Total rows
     total = cur.execute(
         f"SELECT COUNT(*) FROM {table_name}"
     ).fetchone()[0]
-    # Get column info
-    columns = cur.execute(
-        f"PRAGMA table_info({table_name})"
-    ).fetchall()
-    summary = f"Here’s a summary of **{table_name}**:\n\n"
     summary += f"• Total records: {total}\n"
-    # Try to summarize categorical columns
-    for col in columns:
-        col_name = col[1]
-        col_type = col[2].lower()
-        if col_type in ("text", "varchar"):
-            try:
-                rows = cur.execute(
-                    f"""
-                    SELECT {col_name}, COUNT(*)
-                    FROM {table_name}
-                    GROUP BY {col_name}
-                    ORDER BY COUNT(*) DESC
-                    LIMIT 5
-                    """
-                ).fetchall()
-                if rows:
-                    summary += f"\n• {col_name.capitalize()} breakdown:\n"
-                    for val, count in rows:
-                        summary += f"  - {val}: {count}\n"
-            except:
-                pass  # ignore columns that can't be grouped
     summary += "\nYou can ask more detailed questions about this data."
@@ -453,13 +496,22 @@ def process_question(question):
         "message": build_table_summary(matched_tables[0]),
         "data": []
     }
-    if len(matched_tables) > 1:
         return {
         "status": "ok",
         "message": (
-            "Your question matches multiple datasets:\n"
-            + "\n".join(f"- {t}" for t in matched_tables)
-            + "\n\nPlease be more specific."
         ),
         "data": []
     }
@@ -512,7 +564,10 @@ def process_question(question):
             "data": []
         }
-    sql = validate_sql(sanitize_sql(sql))
     cols, rows = run_query(sql)
     # ----------------------------------

 import os
+import re
 import sqlite3
 from openai import OpenAI
 from difflib import get_close_matches
         "consultant": ["encounters"],
         "doctor": ["encounters"],
         "visit": ["encounters"],
+        "visited": ["encounters"],  # Handle past tense
+        "visits": ["encounters"],   # Handle plural
         "appointment": ["encounters"],
         "patient": ["patients"],
         "medication": ["medications"],
             elif any(tok in col_l for tok in tokens):
                 score += 1
+        # 3️⃣ Description relevance (less weight to avoid false positives)
         if meta.get("description"):
             desc_tokens = set(meta["description"].lower().split())
+            # Only count meaningful word matches, not common words
+            common_words = {"the", "is", "at", "which", "on", "for", "a", "an"}
+            meaningful_matches = tokens & desc_tokens - common_words
+            if meaningful_matches:
+                score += len(meaningful_matches) * 0.5  # Reduced weight
+        # 4️⃣ Semantic intent mapping (important - highest priority)
         for intent, tables in DOMAIN_HINTS.items():
             if intent in q and table_l in tables:
                 score += 5
         # 5️⃣ Only add if meets minimum threshold (prevents low-quality matches)
+        # Increased threshold from 3 to 4 for better precision
+        if score >= 4:
             matched.append((table, score))
     # Sort by relevance
 Rules:
 - Use only SELECT
 - SQLite syntax
+- Use ONLY the exact table names listed below (do not create or infer table names)
 - Use only listed tables/columns
 - Return ONLY SQL or NOT_ANSWERABLE
+IMPORTANT: If the question mentions "visit", "visited", or "visits", use the table name "encounters" (NOT "visits" or "visit").
+If the question mentions "consultant" or "doctor", use the table name "encounters".
 """
     for table, meta in schema.items():
             prompt += f"- {col}: {desc}\n"
     prompt += f"\nQuestion: {question}\n"
+    prompt += "\nRemember: Use EXACT table names from the list above. Do not pluralize or modify table names."
     return prompt
     sql = sql.split(";")[0]
     return sql.replace("\n", " ").strip()
def correct_table_names(sql):
    """Rewrite commonly mis-generated table names in SQL text.

    Each known wrong name (for example ``visits``) is replaced by its
    intended table, but only when the wrong name is not itself a table in
    the schema and the intended table is. Matching is case-insensitive and
    whole-word only. Text inside single-quoted string literals is left
    untouched, because it is query data rather than an identifier.

    NOTE(review): this is a textual rewrite, not a SQL parse; a column or
    alias that is spelled exactly like a wrong table name is rewritten too.

    Args:
        sql: SQL text to correct.

    Returns:
        The SQL text with applicable corrections made; the input unchanged
        when no correction applies.
    """
    schema = load_ai_schema()
    # Lower-cased name -> name as declared in the schema, so the existence
    # checks really are case-insensitive and the replacement uses the
    # schema's own spelling.
    declared = {name.lower(): name for name in schema}

    # Wrong name -> intended table.
    table_corrections = {
        "visits": "encounters",
        "visit": "encounters",
        "providers": "encounters",  # if this table doesn't exist
    }

    # Keep only corrections whose wrong name is absent and whose target exists.
    applicable = {
        wrong: declared[right]
        for wrong, right in table_corrections.items()
        if wrong not in declared and right in declared
    }
    if not applicable:
        return sql

    # Word boundaries avoid partial replacements (e.g. inside "visit_date").
    wrong_name_re = re.compile(
        r"\b(?:" + "|".join(re.escape(wrong) for wrong in applicable) + r")\b",
        flags=re.IGNORECASE,
    )

    def _replacement(match):
        return applicable[match.group(0).lower()]

    # Splitting on a capturing group keeps the literals in the result:
    # even indexes are SQL code, odd indexes are single-quoted literals
    # ('' is the escaped quote inside a literal).
    pieces = re.split(r"('(?:[^']|'')*')", sql)
    pieces[::2] = [wrong_name_re.sub(_replacement, code) for code in pieces[::2]]
    return "".join(pieces)
 def validate_sql(sql):
     if not sql.lower().startswith("select"):
         raise Exception("Only SELECT allowed")
 def build_table_summary(table_name):
     cur = conn.cursor()
+    # Total rows (still need to query actual data for count)
     total = cur.execute(
         f"SELECT COUNT(*) FROM {table_name}"
     ).fetchone()[0]
+    # Get column info from METADATA (ai_columns) not database structure
+    schema = load_ai_schema()
+    if table_name not in schema:
+        return f"Table {table_name} not found in metadata."
+    columns = schema[table_name]["columns"]  # [(col_name, description), ...]
+    summary = f"Here's a summary of **{table_name}**:\n\n"
     summary += f"• Total records: {total}\n"
+    # Try to summarize categorical columns using metadata
+    for col_name, col_desc in columns:
+        # Try to determine if it's a categorical column based on name/description
+        # Skip likely numeric/date columns
+        col_lower = col_name.lower()
+        if any(skip in col_lower for skip in ["id", "_id", "date", "time", "count", "amount", "price"]):
+            continue
+        # Try to get breakdown for text-like columns
+        try:
+            rows = cur.execute(
+                f"""
+                SELECT {col_name}, COUNT(*)
+                FROM {table_name}
+                GROUP BY {col_name}
+                ORDER BY COUNT(*) DESC
+                LIMIT 5
+                """
+            ).fetchall()
+            if rows:
+                summary += f"\n• {col_name.capitalize()} breakdown:\n"
+                for val, count in rows:
+                    summary += f"  - {val}: {count}\n"
+        except:
+            pass  # ignore columns that can't be grouped
     summary += "\nYou can ask more detailed questions about this data."
         "message": build_table_summary(matched_tables[0]),
         "data": []
     }
+    # Only block if too many tables matched AND it's not an analytical question
+    # Analytical questions (how many, count, etc.) often need multiple tables
+    is_analytical = any(k in q for k in [
+        "how many", "count", "total", "number of",
+        "average", "avg", "sum", "more than", "less than",
+        "compare", "trend"
+    ])
+    if len(matched_tables) > 4 and not is_analytical:
         return {
         "status": "ok",
         "message": (
+            "Your question matches too many datasets:\n"
+            + "\n".join(f"- {t}" for t in matched_tables[:5])
+            + "\n\nPlease be more specific about what you want to know."
         ),
         "data": []
     }
             "data": []
         }
+    # Sanitize, correct table names, then validate
+    sql = sanitize_sql(sql)
+    sql = correct_table_names(sql)
+    sql = validate_sql(sql)
     cols, rows = run_query(sql)
     # ----------------------------------