Spaces:

nilotpaldhar2004
/

text2sql-chatbot

Sleeping

App Files Files Community

nilotpaldhar2004 committed on 25 days ago

Commit

3d13366

unverified ·

1 Parent(s): e870039

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -8

app.py CHANGED Viewed

@@ -79,14 +79,16 @@ def get_schema(db_bytes: bytes) -> str:
     return "\n".join(r[0] for r in rows if r[0])
 def generate_sql(question: str, schema: str) -> str:
-    """Run T5 inference with enhanced regex fixes."""
     table_match = re.search(r'CREATE TABLE\s+"?(\w+)"?', schema, re.IGNORECASE)
-    table_name = table_match.group(1) if table_match else "data"
     quoted = f'"{table_name}"'
     col_match = re.findall(r'"(\w+)"', schema)
     col_hint = ", ".join(col_match) if col_match else ""
     prompt = f"Translate English to SQL: {question} | Table: {table_name} | Columns: {col_hint}"
     inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(DEVICE)
@@ -96,12 +98,20 @@ def generate_sql(question: str, schema: str) -> str:
     sql = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
-    # FIXES
-    sql = re.sub(r'\bFROM\s+("?\w+"?)', f'FROM {quoted}', sql, flags=re.IGNORECASE)
-    sql = re.sub(r'\bJOIN\s+("?\w+"?)', f'JOIN {quoted}', sql, flags=re.IGNORECASE)
-    # Strip trailing junk
-    sql = re.sub(r'(FROM\s+"?\w+"?)\s+(?!WHERE|LIMIT|ORDER|GROUP|HAVING|JOIN|LEFT|RIGHT|INNER|ON|AND|OR|\d)(\w+)', r'\1', sql, flags=re.IGNORECASE)
     if not re.search(r'\bSELECT\b', sql, re.IGNORECASE):
         sql = f'SELECT * FROM {quoted} LIMIT 10'

     return "\n".join(r[0] for r in rows if r[0])
 def generate_sql(question: str, schema: str) -> str:
+    """Run T5 inference with strict case-sensitivity fixes."""
+    # 1. Force lowercase table name detection from schema
     table_match = re.search(r'CREATE TABLE\s+"?(\w+)"?', schema, re.IGNORECASE)
+    # We explicitly lowercase this to match the SQLite storage
+    table_name = table_match.group(1).lower() if table_match else "city_day"
     quoted = f'"{table_name}"'
+    # 2. Build the prompt with explicit lowercase hints
     col_match = re.findall(r'"(\w+)"', schema)
     col_hint = ", ".join(col_match) if col_match else ""
     prompt = f"Translate English to SQL: {question} | Table: {table_name} | Columns: {col_hint}"
     inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(DEVICE)
     sql = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
+    # --- THE CRITICAL FIXES ---
+    # Fix 1: Force the table name to be the lowercase version we found in Step 1
+    # This stops the "City_day" vs "city_day" conflict.
+    sql = re.sub(r'\bFROM\s+["\w]+', f'FROM {quoted}', sql, flags=re.IGNORECASE)
+    sql = re.sub(r'\bJOIN\s+["\w]+', f'JOIN {quoted}', sql, flags=re.IGNORECASE)
+    # Fix 2: Remove junk tokens that T5 inserts after the table name
+    sql = re.sub(r'(FROM\s+"?\w+"?)\s+(?!WHERE|LIMIT|ORDER|GROUP|HAVING|JOIN|ON|AND|OR|UNION)(\w+)', r'\1', sql, flags=re.IGNORECASE)
+    # Fix 3: Standardize common column case issues
+    # If the model writes "city" or "CITY", rewrite it as the schema's quoted "City"
+    for col in col_match:
+        sql = re.sub(rf'\b{col}\b', f'"{col}"', sql, flags=re.IGNORECASE)
     if not re.search(r'\bSELECT\b', sql, re.IGNORECASE):
         sql = f'SELECT * FROM {quoted} LIMIT 10'