Spaces:

nilotpaldhar2004
/

text2sql-chatbot

Sleeping

App Files Community

nilotpaldhar2004 committed on 24 days ago

Commit

2ea4813

verified ·

1 Parent(s): 4ebb141

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -25

app.py CHANGED Viewed

@@ -81,53 +81,38 @@ def get_schema(db_bytes: bytes) -> str:
 def generate_sql(question: str, schema: str) -> str:
-    """
-    Enhanced Hybrid SQL Engine.
-    Priority 1: Smart Regex (Deterministic & Instant)
-    Priority 2: T5 Transformer (Probabilistic Fallback)
-    """
-    # 1. Context Extraction
     table_match = re.search(r'CREATE TABLE\s+"?(\w+)"?', schema, re.IGNORECASE)
     table_name = table_match.group(1) if table_match else "data"
     quoted = f'"{table_name}"'
     col_match = re.findall(r'"(\w+)"', schema)
     q = question.lower().strip()
     # 2. Smart Column Detection
-    # Searches for a column name from the schema within the user's question
     target_col = None
     for col in col_match:
         if col.lower() in q:
             target_col = col
             break
-    # 3. Enhanced Rule-Based Shortcuts (Smart Logic)
-    # DISTINCT/UNIQUE COUNT
     if re.search(r'unique|distinct', q):
         col = target_col if target_col else (col_match[0] if col_match else "*")
         return f'SELECT COUNT(DISTINCT "{col}") FROM {quoted}'
-    # GROUP BY
     if re.search(r'group.*by|per|each', q):
         col = target_col if target_col else (col_match[0] if col_match else "data")
         return f'SELECT "{col}", COUNT(*) FROM {quoted} GROUP BY "{col}"'
-    # AVERAGE (With semantic fallback for your city_day dataset)
-    if re.search(r'average|avg|mean', q):
-        num_col = target_col if target_col else next((c for c in col_match if re.search(r'pm|aqi|no|co|so|o3|benzene|val|amt', c, re.I)), col_match[2] if len(col_match)>2 else col_match[0])
-        return f'SELECT AVG("{num_col}") FROM {quoted}'
-    # TOTAL RECORDS
-    if re.search(r'count.*(total|all|record|row)|total.*(record|row|count)|how many', q):
         return f'SELECT COUNT(*) FROM {quoted}'
-    # LIMIT/TOP ROWS
     if re.search(r'show|display|get|first|top', q):
         n_match = re.search(r'\d+', q)
-        limit = n_match.group() if n_match else 10
-        return f'SELECT * FROM {quoted} LIMIT {limit}'
     # 4. T5 Model Fallback
     col_hint = ", ".join(col_match) if col_match else ""
@@ -139,10 +124,16 @@ def generate_sql(question: str, schema: str) -> str:
     sql = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
-    # Post-inference cleaning (Crucial for SQLite stability)
-    sql = re.sub(r'\bFROM\s+("?\w+"?)', f'FROM {quoted}', sql, flags=re.IGNORECASE)
-    sql = re.sub(r'(FROM\s+"?\w+"?)\s+(?!WHERE|LIMIT|ORDER|GROUP|HAVING|JOIN|ON|AND|OR)(\w+)', r'\1', sql, flags=re.IGNORECASE)
     if not re.search(r'\bSELECT\b', sql, re.IGNORECASE):
         sql = f'SELECT * FROM {quoted} LIMIT 10'

 def generate_sql(question: str, schema: str) -> str:
+    # 1. Context Extraction (Same as before)
     table_match = re.search(r'CREATE TABLE\s+"?(\w+)"?', schema, re.IGNORECASE)
     table_name = table_match.group(1) if table_match else "data"
     quoted = f'"{table_name}"'
     col_match = re.findall(r'"(\w+)"', schema)
     q = question.lower().strip()
     # 2. Smart Column Detection
     target_col = None
     for col in col_match:
         if col.lower() in q:
             target_col = col
             break
+    # 3. Enhanced Rule-Based Shortcuts (Deterministic)
     if re.search(r'unique|distinct', q):
         col = target_col if target_col else (col_match[0] if col_match else "*")
         return f'SELECT COUNT(DISTINCT "{col}") FROM {quoted}'
     if re.search(r'group.*by|per|each', q):
         col = target_col if target_col else (col_match[0] if col_match else "data")
         return f'SELECT "{col}", COUNT(*) FROM {quoted} GROUP BY "{col}"'
+    if re.search(r'count.*(total|all|record|row|paris)|how many', q):
+        # Special case for "Count the Paris" -> We search for 'Paris' in all columns
+        if "paris" in q:
+            return f'SELECT COUNT(*) FROM {quoted} WHERE "answer" LIKE "%Paris%" OR "question" LIKE "%Paris%"'
         return f'SELECT COUNT(*) FROM {quoted}'
     if re.search(r'show|display|get|first|top', q):
         n_match = re.search(r'\d+', q)
+        return f'SELECT * FROM {quoted} LIMIT {n_match.group() if n_match else 10}'
     # 4. T5 Model Fallback
     col_hint = ", ".join(col_match) if col_match else ""
     sql = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
+    # ── CRITICAL CLEANING GUARDRAIL ──
+    # This removes hallucinations like "Table | SQL | Columns" from the output
+    if "|" in sql:
+        sql = sql.split("|")[-1].strip() # Take only the part after the last pipe
+    # Remove common prefix hallucinations
+    sql = re.sub(r'^(sql|query|result|table):', '', sql, flags=re.IGNORECASE).strip()
+    # Force Table and SELECT
+    sql = re.sub(r'\bFROM\s+("?\w+"?)', f'FROM {quoted}', sql, flags=re.IGNORECASE)
     if not re.search(r'\bSELECT\b', sql, re.IGNORECASE):
         sql = f'SELECT * FROM {quoted} LIMIT 10'