Spaces:

nilotpaldhar2004
/

text2sql-chatbot

Sleeping

App Files Files Community

nilotpaldhar2004 committed 23 days ago

Commit

1d456a9

verified ·

1 Parent(s): 822614c

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -46

app.py CHANGED Viewed

@@ -81,16 +81,11 @@ def get_schema(db_bytes: bytes) -> str:
     return "\n".join(r[0] for r in rows if r[0])
 def generate_sql(question: str, schema: str) -> str:
-    """
-    Dual-Stream SQL Generation:
-    1. Deterministic (Regex) - Matches common analysis patterns.
-    2. Probabilistic (T5) - Handles complex phrasing as fallback.
-    """
     table_match = re.search(r'CREATE TABLE\s+"?(\w+)"?', schema, re.IGNORECASE)
     table_name = table_match.group(1) if table_match else "data"
     quoted = f'"{table_name}"'
     col_match = re.findall(r'"(\w+)"', schema)
     q = question.lower().strip()
     # Smart Column Detection
@@ -100,54 +95,38 @@ def generate_sql(question: str, schema: str) -> str:
             target_col = col
             break
-    # ── Deterministic Layer ──
-    # DISTINCT/UNIQUE
     if re.search(r'unique|distinct', q):
-        col = target_col if target_col else (col_match[0] if col_match else "*")
         return f'SELECT COUNT(DISTINCT "{col}") FROM {quoted}'
     # GROUP BY
     if re.search(r'group.*by|per|each', q):
-        col = target_col if target_col else (col_match[0] if col_match else "data")
         return f'SELECT "{col}", COUNT(*) FROM {quoted} GROUP BY "{col}"'
-    # AVERAGE
-    if re.search(r'average|mean|avg', q):
-        num_col = target_col if target_col else next((c for c in col_match if re.search(r'pm|aqi|no|co|so|o3|benzene|val|amt', c, re.I)), col_match[0])
-        return f'SELECT AVG("{num_col}") FROM {quoted}'
-    # COUNT
-    if re.search(r'count|total|how many', q):
-        if target_col and len(q.split()) > 2:
-            return f'SELECT COUNT(*) FROM {quoted} WHERE "{target_col}" LIKE "%{q.split()[-1]}%"'
-        return f'SELECT COUNT(*) FROM {quoted}'
-    # LIMIT
-    if re.search(r'show|display|get|first|top', q):
-        n_match = re.search(r'\d+', q)
-        limit = n_match.group() if n_match else 10
-        return f'SELECT * FROM {quoted} LIMIT {limit}'
-    # ── Probabilistic Fallback ──
-    col_hint = ", ".join(col_match) if col_match else ""
-    prompt = f"Translate English to SQL: {question} | Table: {table_name} | Columns: {col_hint}"
-    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(DEVICE)
-    with torch.no_grad():
-        outputs = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS, num_beams=4, early_stopping=True)
-    sql = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
-    # Output Sanitization
-    if "|" in sql: sql = sql.split("|")[-1].strip()
-    sql = re.sub(r'^(sql|query|table):', '', sql, flags=re.IGNORECASE).strip()
-    sql = re.sub(r'\bFROM\s+("?\w+"?)', f'FROM {quoted}', sql, flags=re.IGNORECASE)
-    sql = re.sub(r'(FROM\s+"?\w+"?)\s+(?!WHERE|LIMIT|ORDER|GROUP|HAVING|JOIN|ON|AND|OR)(\w+)', r'\1', sql, flags=re.IGNORECASE)
-    if not re.search(r'\bSELECT\b', sql, re.IGNORECASE):
-        sql = f'SELECT * FROM {quoted} LIMIT 10'
     return sql
 def execute_sql(sql: str, db_bytes: bytes) -> list[dict]:

     return "\n".join(r[0] for r in rows if r[0])
 def generate_sql(question: str, schema: str) -> str:
+    # 1. Context Extraction
     table_match = re.search(r'CREATE TABLE\s+"?(\w+)"?', schema, re.IGNORECASE)
     table_name = table_match.group(1) if table_match else "data"
     quoted = f'"{table_name}"'
     col_match = re.findall(r'"(\w+)"', schema)
     q = question.lower().strip()
     # Smart Column Detection
             target_col = col
             break
+    # 2. Advanced Rule-Based Shortcuts
+    # FILTERING (e.g., "is Paris", "where answer is Paris")
+    if "is" in q or "=" in q:
+        # Extract the value (e.g., "Paris")
+        value_match = re.search(r"is\s+(['\"]?\w+['\"]?)", q)
+        if value_match:
+            val = value_match.group(1).strip("'\"")
+            # If "question" is in the text, user probably wants the question for that answer
+            select_col = col_match[0] if "question" in q else "*"
+            filter_col = target_col if target_col else col_match[1]
+            return f'SELECT "{select_col}" FROM {quoted} WHERE "{filter_col}" LIKE "%{val}%"'
+    # SELECT DISTINCT (List the names) vs COUNT DISTINCT (How many)
     if re.search(r'unique|distinct', q):
+        col = target_col if target_col else col_match[0]
+        if re.search(r'show|list|get|give|what are', q):
+            return f'SELECT DISTINCT "{col}" FROM {quoted} LIMIT 50'
         return f'SELECT COUNT(DISTINCT "{col}") FROM {quoted}'
+    # SPECIFIC COLUMN SELECTION (e.g., "show all answers")
+    if re.search(r'show|list|get', q) and target_col:
+        if not re.search(r'count|avg|mean|sum', q):
+            return f'SELECT "{target_col}" FROM {quoted} LIMIT 50'
     # GROUP BY
     if re.search(r'group.*by|per|each', q):
+        col = target_col if target_col else col_match[0]
         return f'SELECT "{col}", COUNT(*) FROM {quoted} GROUP BY "{col}"'
+    # 3. T5 Fallback (Existing logic)
+    # ... [Keep your T5 code and Sanitization here] ...
     return sql
 def execute_sql(sql: str, db_bytes: bytes) -> list[dict]: