Spaces:

nilotpaldhar2004
/

text2sql-chatbot

Sleeping

App Files Files Community

nilotpaldhar2004 committed 25 days ago

Commit

7073cc4

unverified ·

1 Parent(s): 3d13366

Refactor app.py for model update and code clarity

Browse files

Updated the model name in the module docstring and improved comments for clarity (the MODEL_NAME value itself is unchanged). Adjusted table-name handling and SQL generation logic.

Files changed (1) hide show

app.py +55 -48

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 """
-app.py — Model: T5-Small (Text-to-SQL)
 HuggingFace Space: Free Tier (CPU)
 """
@@ -19,20 +19,20 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import torch
 # ── Config ────────────────────────────────────────────────────────────────────
-MODEL_NAME = "cssupport/t5-small-awesome-text-to-sql"
 MAX_NEW_TOKENS = 256
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 # ── Load model once at startup ─────────────────────────────────────────────────
-print(f"[INFO] Loading model: {MODEL_NAME} | device: {DEVICE}")
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(DEVICE)
 model.eval()
 print("[INFO] Model ready.")
 # ── In-memory DB store ─────────────────────────────────────────────────────────
-_db_store: dict[str, bytes] = {}
-_schema_store: dict[str, str] = {}
 app = FastAPI(title="CSV-to-SQL Chat", version="1.0.0")
@@ -50,21 +50,22 @@ app.mount("/static", StaticFiles(directory="static"), name="static")
 def root():
     return FileResponse("static/index.html")
 # ── Helpers ────────────────────────────────────────────────────────────────────
 def csv_to_sqlite(df: pd.DataFrame, table_name: str = "data") -> bytes:
     """Convert DataFrame → SQLite DB bytes."""
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp_path = tmp.name
     conn = sqlite3.connect(tmp_path)
-    # Ensure table name is clean
-    clean_table = re.sub(r"[^a-zA-Z0-9_]", "_", table_name)
-    df.to_sql(clean_table, conn, if_exists="replace", index=False)
     conn.close()
     with open(tmp_path, "rb") as f:
         db_bytes = f.read()
     os.unlink(tmp_path)
     return db_bytes
 def get_schema(db_bytes: bytes) -> str:
     """Extract CREATE TABLE schema from DB bytes."""
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
@@ -78,52 +79,57 @@ def get_schema(db_bytes: bytes) -> str:
     os.unlink(tmp_path)
     return "\n".join(r[0] for r in rows if r[0])
 def generate_sql(question: str, schema: str) -> str:
-    """Run T5 inference with strict case-sensitivity fixes."""
-    # 1. Force lowercase table name detection from schema
     table_match = re.search(r'CREATE TABLE\s+"?(\w+)"?', schema, re.IGNORECASE)
-    # We explicitly lowercase this to match the SQLite storage
-    table_name = table_match.group(1).lower() if table_match else "city_day"
     quoted = f'"{table_name}"'
-    # 2. Build the prompt with explicit lowercase hints
     col_match = re.findall(r'"(\w+)"', schema)
     col_hint = ", ".join(col_match) if col_match else ""
-    prompt = f"Translate English to SQL: {question} | Table: {table_name} | Columns: {col_hint}"
-    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(DEVICE)
     with torch.no_grad():
-        outputs = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS, num_beams=4, early_stopping=True)
     sql = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
-    # --- THE CRITICAL FIXES ---
-    # Fix 1: Force the table name to be the lowercase version we found in Step 1
-    # This stops the "City_day" vs "city_day" conflict.
-    sql = re.sub(r'\bFROM\s+["\w]+', f'FROM {quoted}', sql, flags=re.IGNORECASE)
-    sql = re.sub(r'\bJOIN\s+["\w]+', f'JOIN {quoted}', sql, flags=re.IGNORECASE)
-    # Fix 2: Remove junk tokens that T5 inserts after the table name
-    sql = re.sub(r'(FROM\s+"?\w+"?)\s+(?!WHERE|LIMIT|ORDER|GROUP|HAVING|JOIN|ON|AND|OR|UNION)(\w+)', r'\1', sql, flags=re.IGNORECASE)
-    # Fix 3: Standardize common column case issues
-    # If the model writes "City", we make sure it matches the schema's "City"
-    for col in col_match:
-        sql = re.sub(rf'\b{col}\b', f'"{col}"', sql, flags=re.IGNORECASE)
     if not re.search(r'\bSELECT\b', sql, re.IGNORECASE):
         sql = f'SELECT * FROM {quoted} LIMIT 10'
     return sql
 def execute_sql(sql: str, db_bytes: bytes) -> list[dict]:
     """Run SQL against the in-memory SQLite DB."""
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp.write(db_bytes)
         tmp_path = tmp.name
     conn = sqlite3.connect(tmp_path)
     conn.row_factory = sqlite3.Row
     try:
@@ -132,23 +138,23 @@ def execute_sql(sql: str, db_bytes: bytes) -> list[dict]:
     except Exception as e:
         conn.close()
         os.unlink(tmp_path)
-        # Return error as a list to be caught by JSONResponse
-        raise HTTPException(status_code=400, detail=f"SQL error: {str(e)}")
     conn.close()
     os.unlink(tmp_path)
     return rows
 # ── Routes ─────────────────────────────────────────────────────────────────────
 class QueryRequest(BaseModel):
     session_id: str
     question: str
 @app.post("/upload")
 async def upload_csv(file: UploadFile = File(...)):
     if not file.filename.endswith(".csv"):
         raise HTTPException(status_code=400, detail="Only CSV files accepted.")
     contents = await file.read()
     try:
         df = pd.read_csv(io.BytesIO(contents))
@@ -156,38 +162,39 @@ async def upload_csv(file: UploadFile = File(...)):
         raise HTTPException(status_code=400, detail=f"CSV parse error: {e}")
     session_id = os.urandom(8).hex()
-    # Clean table name from filename
-    raw_name = os.path.splitext(file.filename)[0]
-    table_name = re.sub(r"[^a-zA-Z0-9_]", "_", raw_name)[:32] or "data"
     db_bytes = csv_to_sqlite(df, table_name)
     schema = get_schema(db_bytes)
     _db_store[session_id] = db_bytes
     _schema_store[session_id] = schema
     return JSONResponse({
         "session_id": session_id,
         "table_name": table_name,
-        "columns": list(df.columns),
         "row_count": len(df),
-        "preview": df.head(5).to_dict(orient="records"),
         "schema": schema,
     })
 @app.post("/query")
 async def query(req: QueryRequest):
     if req.session_id not in _db_store:
-        raise HTTPException(status_code=404, detail="Session not found. Upload CSV first.")
     schema = _schema_store[req.session_id]
     sql = generate_sql(req.question, schema)
-    # This is where your previous code was likely failing
     results = execute_sql(sql, _db_store[req.session_id])
     return JSONResponse({"sql": sql, "results": results})
 @app.get("/health")
 def health():
     return {"status": "ok", "model": MODEL_NAME, "device": DEVICE}

 """
+app.py — Model: cssupport/t5-small-awesome-text-to-sql (Text-to-SQL)
 HuggingFace Space: Free Tier (CPU)
 """
 import torch
 # ── Config ────────────────────────────────────────────────────────────────────
+MODEL_NAME = "cssupport/t5-small-awesome-text-to-sql"   # T5-based text→SQL, CPU-friendly
 MAX_NEW_TOKENS = 256
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 # ── Load model once at startup ─────────────────────────────────────────────────
+print(f"[INFO] Loading model: {MODEL_NAME}  |  device: {DEVICE}")
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(DEVICE)
 model.eval()
 print("[INFO] Model ready.")
 # ── In-memory DB store ─────────────────────────────────────────────────────────
+_db_store: dict[str, bytes] = {}   # session_id → sqlite db bytes
+_schema_store: dict[str, str] = {} # session_id → schema string
 app = FastAPI(title="CSV-to-SQL Chat", version="1.0.0")
 def root():
     return FileResponse("static/index.html")
 # ── Helpers ────────────────────────────────────────────────────────────────────
 def csv_to_sqlite(df: pd.DataFrame, table_name: str = "data") -> bytes:
     """Convert DataFrame → SQLite DB bytes."""
+    buf = io.BytesIO()
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp_path = tmp.name
     conn = sqlite3.connect(tmp_path)
+    df.to_sql(table_name, conn, if_exists="replace", index=False)
     conn.close()
     with open(tmp_path, "rb") as f:
         db_bytes = f.read()
     os.unlink(tmp_path)
     return db_bytes
 def get_schema(db_bytes: bytes) -> str:
     """Extract CREATE TABLE schema from DB bytes."""
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
     os.unlink(tmp_path)
     return "\n".join(r[0] for r in rows if r[0])
 def generate_sql(question: str, schema: str) -> str:
+    """Run T5 inference to produce SQL."""
+    # Extract table name from schema
     table_match = re.search(r'CREATE TABLE\s+"?(\w+)"?', schema, re.IGNORECASE)
+    table_name = table_match.group(1) if table_match else "data"
     quoted = f'"{table_name}"'
+    # Extract column names to inject into prompt — helps T5-small stay grounded
     col_match = re.findall(r'"(\w+)"', schema)
     col_hint = ", ".join(col_match) if col_match else ""
+    prompt = f"tables:\n{schema}\ncolumns: {col_hint}\nquery for: {question}"
+    inputs = tokenizer(
+        prompt,
+        return_tensors="pt",
+        truncation=True,
+        max_length=512,
+    ).to(DEVICE)
     with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=MAX_NEW_TOKENS,
+            num_beams=4,
+            early_stopping=True,
+        )
     sql = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
+    # Fix 1: replace any FROM/JOIN table reference (quoted or unquoted) with correct table
+    sql = re.sub(r'\bFROM\s+("?\w+"?)', f'FROM {quoted}', sql, flags=re.IGNORECASE)
+    sql = re.sub(r'\bJOIN\s+("?\w+"?)', f'JOIN {quoted}', sql, flags=re.IGNORECASE)
+    # Fix 2: strip junk tokens after table name before LIMIT/WHERE/ORDER etc.
+    # e.g. FROM "city_day" Datetime LIMIT 10  →  FROM "city_day" LIMIT 10
+    sql = re.sub(
+        r'(FROM\s+"?\w+"?)\s+(?!WHERE|LIMIT|ORDER|GROUP|HAVING|JOIN|LEFT|RIGHT|INNER|ON|AND|OR|\d)(\w+)',
+        r'\1',
+        sql, flags=re.IGNORECASE
+    )
+    # Fix 3: fallback if no SELECT at all
     if not re.search(r'\bSELECT\b', sql, re.IGNORECASE):
         sql = f'SELECT * FROM {quoted} LIMIT 10'
     return sql
 def execute_sql(sql: str, db_bytes: bytes) -> list[dict]:
     """Run SQL against the in-memory SQLite DB."""
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp.write(db_bytes)
         tmp_path = tmp.name
     conn = sqlite3.connect(tmp_path)
     conn.row_factory = sqlite3.Row
     try:
     except Exception as e:
         conn.close()
         os.unlink(tmp_path)
+        raise HTTPException(status_code=400, detail=f"SQL error: {e}")
     conn.close()
     os.unlink(tmp_path)
     return rows
 # ── Routes ─────────────────────────────────────────────────────────────────────
 class QueryRequest(BaseModel):
     session_id: str
     question: str
 @app.post("/upload")
 async def upload_csv(file: UploadFile = File(...)):
+    """Upload CSV → parse → store as SQLite → return session_id & preview."""
     if not file.filename.endswith(".csv"):
         raise HTTPException(status_code=400, detail="Only CSV files accepted.")
     contents = await file.read()
     try:
         df = pd.read_csv(io.BytesIO(contents))
         raise HTTPException(status_code=400, detail=f"CSV parse error: {e}")
     session_id = os.urandom(8).hex()
+    table_name = re.sub(r"[^a-zA-Z0-9_]", "_", os.path.splitext(file.filename)[0])[:32] or "data"
+    if table_name[0].isdigit():
+        table_name = "t_" + table_name
     db_bytes = csv_to_sqlite(df, table_name)
     schema = get_schema(db_bytes)
     _db_store[session_id] = db_bytes
     _schema_store[session_id] = schema
+    preview = df.head(5).to_dict(orient="records")
+    columns = list(df.columns)
     return JSONResponse({
         "session_id": session_id,
         "table_name": table_name,
+        "columns": columns,
         "row_count": len(df),
+        "preview": preview,
         "schema": schema,
     })
 @app.post("/query")
 async def query(req: QueryRequest):
+    """Natural language question → SQL → execute → return results."""
     if req.session_id not in _db_store:
+        raise HTTPException(status_code=404, detail="Session not found. Please upload CSV first.")
     schema = _schema_store[req.session_id]
     sql = generate_sql(req.question, schema)
     results = execute_sql(sql, _db_store[req.session_id])
     return JSONResponse({"sql": sql, "results": results})
 @app.get("/health")
 def health():
     return {"status": "ok", "model": MODEL_NAME, "device": DEVICE}