Spaces:

nilotpaldhar2004
/

text2sql-chatbot

Sleeping

App Files Files Community

nilotpaldhar2004 committed 17 days ago

Commit

d39d3d8

unverified ·

1 Parent(s): f06394a

Update app.py

Browse files

Files changed (1) hide show

app.py +86 -79

app.py CHANGED Viewed

@@ -1,6 +1,12 @@
 import os
 import re
 import io
 import sqlite3
 import tempfile
 import pandas as pd
@@ -9,44 +15,26 @@ from fastapi.staticfiles import StaticFiles
 from fastapi.responses import FileResponse, JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
-from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 import torch
 # ── Config ────────────────────────────────────────────────────────────────────
-MODEL_NAME = "defog/sqlcoder-7b-2"
-MAX_NEW_TOKENS = 300
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-# ── Memory-Optimized Model Loading ───────────────────────────────────────────
-print(f"[INFO] Loading model: {MODEL_NAME} | device: {DEVICE}")
-print("[INFO] Applying 4-bit quantization to fit within 16Gi RAM limit...")
-# Configure 4-bit quantization for memory efficiency
-quant_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_use_double_quant=True,
-    bnb_4bit_compute_dtype=torch.float16 if DEVICE == "cuda" else torch.float32
-)
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-# Load model with quantization and low memory usage settings
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_NAME,
-    quantization_config=quant_config,
-    device_map="auto",
-    low_cpu_mem_usage=True,
-    trust_remote_code=True
-)
 model.eval()
-print("[INFO] Model loaded successfully.")
-# ── In-memory store ────────────────────────────────────────────────────────────
-_db_store: dict[str, bytes] = {}
-_schema_store: dict[str, str] = {}
-app = FastAPI(title="SQLCoder CSV Chat", version="1.1.0")
 app.add_middleware(
     CORSMiddleware,
@@ -56,18 +44,17 @@ app.add_middleware(
 )
 # ── Static frontend ────────────────────────────────────────────────────────────
-# Ensure your index.html is in a folder named 'static'
-if not os.path.exists("static"):
-    os.makedirs("static")
 app.mount("/static", StaticFiles(directory="static"), name="static")
 @app.get("/")
 def root():
     return FileResponse("static/index.html")
 # ── Helpers ────────────────────────────────────────────────────────────────────
 def csv_to_sqlite(df: pd.DataFrame, table_name: str = "data") -> bytes:
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp_path = tmp.name
     conn = sqlite3.connect(tmp_path)
@@ -78,7 +65,9 @@ def csv_to_sqlite(df: pd.DataFrame, table_name: str = "data") -> bytes:
     os.unlink(tmp_path)
     return db_bytes
 def get_schema(db_bytes: bytes) -> str:
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp.write(db_bytes)
         tmp_path = tmp.name
@@ -90,50 +79,54 @@ def get_schema(db_bytes: bytes) -> str:
     os.unlink(tmp_path)
     return "\n".join(r[0] for r in rows if r[0])
-def build_prompt(question: str, schema: str) -> str:
-    """SQLCoder specific prompt format for better accuracy."""
-    return f"""### Task
-Generate a SQL query to answer [QUESTION]{question}[/QUESTION]
-### Database Schema
-The query will run on a database with the following schema:
-{schema}
-### Answer
-Given the database schema, here is the SQL query that [QUESTION]{question}[/QUESTION]
-[SQL]
-"""
 def generate_sql(question: str, schema: str) -> str:
     table_match = re.search(r'CREATE TABLE\s+"?(\w+)"?', schema, re.IGNORECASE)
-    table_name = table_match.group(1) if table_match else "user_data"
-    prompt = build_prompt(question, schema)
-    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(model.device)
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
             max_new_tokens=MAX_NEW_TOKENS,
-            do_sample=False,
-            num_beams=1,
-            eos_token_id=tokenizer.eos_token_id,
-            pad_token_id=tokenizer.eos_token_id
         )
-    # Decode newly generated tokens
-    generated_ids = outputs[0][inputs["input_ids"].shape[1]:]
-    sql = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
-    # Post-processing and cleaning
-    sql = sql.split("[/SQL]")[0].strip()
-    sql = re.sub(r"```sql|```", "", sql).strip()
-    sql = re.sub(r'\bFROM\s+(\w+)', f'FROM "{table_name}"', sql, flags=re.IGNORECASE)
     return sql
 def execute_sql(sql: str, db_bytes: bytes) -> list[dict]:
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp.write(db_bytes)
         tmp_path = tmp.name
@@ -143,50 +136,64 @@ def execute_sql(sql: str, db_bytes: bytes) -> list[dict]:
         cur = conn.execute(sql)
         rows = [dict(r) for r in cur.fetchall()]
     except Exception as e:
-        raise HTTPException(status_code=400, detail=f"Execution error: {e}")
-    finally:
         conn.close()
         os.unlink(tmp_path)
     return rows
 # ── Routes ─────────────────────────────────────────────────────────────────────
 class QueryRequest(BaseModel):
     session_id: str
     question: str
 @app.post("/upload")
 async def upload_csv(file: UploadFile = File(...)):
     if not file.filename.endswith(".csv"):
-        raise HTTPException(status_code=400, detail="Invalid file type. Upload a CSV.")
     contents = await file.read()
-    df = pd.read_csv(io.BytesIO(contents))
     session_id = os.urandom(8).hex()
-    table_name = "user_data" # Standardized for internal SQL logic
     db_bytes = csv_to_sqlite(df, table_name)
     schema = get_schema(db_bytes)
     _db_store[session_id] = db_bytes
     _schema_store[session_id] = schema
-    return {
         "session_id": session_id,
-        "columns": list(df.columns),
-        "preview": df.head(3).to_dict(orient="records")
-    }
 @app.post("/query")
 async def query(req: QueryRequest):
     if req.session_id not in _db_store:
-        raise HTTPException(status_code=404, detail="Session expired.")
     schema = _schema_store[req.session_id]
     sql = generate_sql(req.question, schema)
     results = execute_sql(sql, _db_store[req.session_id])
-    return {"sql": sql, "results": results}
 @app.get("/health")
 def health():
-    return {"status": "running", "quantization": "4-bit"}

+"""
+app.py — Model: cssupport/t5-small-awesome-text-to-sql (Text-to-SQL)
+HuggingFace Space: Free Tier (CPU)
+"""
 import os
 import re
 import io
+import json
 import sqlite3
 import tempfile
 import pandas as pd
 from fastapi.responses import FileResponse, JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import torch
 # ── Config ────────────────────────────────────────────────────────────────────
+MODEL_NAME = "cssupport/t5-small-awesome-text-to-sql"   # T5-based text→SQL, CPU-friendly
+MAX_NEW_TOKENS = 256
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# ── Load model once at startup ─────────────────────────────────────────────────
+print(f"[INFO] Loading model: {MODEL_NAME}  |  device: {DEVICE}")
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(DEVICE)
 model.eval()
+print("[INFO] Model ready.")
+# ── In-memory DB store ─────────────────────────────────────────────────────────
+_db_store: dict[str, bytes] = {}   # session_id → sqlite db bytes
+_schema_store: dict[str, str] = {} # session_id → schema string
+app = FastAPI(title="CSV-to-SQL Chat", version="1.0.0")
 app.add_middleware(
     CORSMiddleware,
 )
 # ── Static frontend ────────────────────────────────────────────────────────────
 app.mount("/static", StaticFiles(directory="static"), name="static")
 @app.get("/")
 def root():
     return FileResponse("static/index.html")
 # ── Helpers ────────────────────────────────────────────────────────────────────
 def csv_to_sqlite(df: pd.DataFrame, table_name: str = "data") -> bytes:
+    """Convert DataFrame → SQLite DB bytes."""
+    buf = io.BytesIO()
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp_path = tmp.name
     conn = sqlite3.connect(tmp_path)
     os.unlink(tmp_path)
     return db_bytes
 def get_schema(db_bytes: bytes) -> str:
+    """Extract CREATE TABLE schema from DB bytes."""
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp.write(db_bytes)
         tmp_path = tmp.name
     os.unlink(tmp_path)
     return "\n".join(r[0] for r in rows if r[0])
 def generate_sql(question: str, schema: str) -> str:
+    """Run T5 inference to produce SQL."""
+    # Extract table name from schema
     table_match = re.search(r'CREATE TABLE\s+"?(\w+)"?', schema, re.IGNORECASE)
+    table_name = table_match.group(1) if table_match else "data"
+    quoted = f'"{table_name}"'
+    # Extract column names to inject into prompt — helps T5-small stay grounded
+    col_match = re.findall(r'"(\w+)"', schema)
+    col_hint = ", ".join(col_match) if col_match else ""
+    prompt = f"tables:\n{schema}\ncolumns: {col_hint}\nquery for: {question}"
+    inputs = tokenizer(
+        prompt,
+        return_tensors="pt",
+        truncation=True,
+        max_length=512,
+    ).to(DEVICE)
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
             max_new_tokens=MAX_NEW_TOKENS,
+            num_beams=4,
+            early_stopping=True,
         )
+    sql = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
+    # Fix 1: replace any FROM/JOIN table reference (quoted or unquoted) with correct table
+    sql = re.sub(r'\bFROM\s+("?\w+"?)', f'FROM {quoted}', sql, flags=re.IGNORECASE)
+    sql = re.sub(r'\bJOIN\s+("?\w+"?)', f'JOIN {quoted}', sql, flags=re.IGNORECASE)
+    # Fix 2: strip junk tokens after table name before LIMIT/WHERE/ORDER etc.
+    # e.g. FROM "city_day" Datetime LIMIT 10  →  FROM "city_day" LIMIT 10
+    sql = re.sub(
+        r'(FROM\s+"?\w+"?)\s+(?!WHERE|LIMIT|ORDER|GROUP|HAVING|JOIN|LEFT|RIGHT|INNER|ON|AND|OR|\d)(\w+)',
+        r'\1',
+        sql, flags=re.IGNORECASE
+    )
+    # Fix 3: fallback if no SELECT at all
+    if not re.search(r'\bSELECT\b', sql, re.IGNORECASE):
+        sql = f'SELECT * FROM {quoted} LIMIT 10'
     return sql
 def execute_sql(sql: str, db_bytes: bytes) -> list[dict]:
+    """Run SQL against the in-memory SQLite DB."""
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp.write(db_bytes)
         tmp_path = tmp.name
         cur = conn.execute(sql)
         rows = [dict(r) for r in cur.fetchall()]
     except Exception as e:
         conn.close()
         os.unlink(tmp_path)
+        raise HTTPException(status_code=400, detail=f"SQL error: {e}")
+    conn.close()
+    os.unlink(tmp_path)
     return rows
 # ── Routes ─────────────────────────────────────────────────────────────────────
 class QueryRequest(BaseModel):
     session_id: str
     question: str
 @app.post("/upload")
 async def upload_csv(file: UploadFile = File(...)):
+    """Upload CSV → parse → store as SQLite → return session_id & preview."""
     if not file.filename.endswith(".csv"):
+        raise HTTPException(status_code=400, detail="Only CSV files accepted.")
     contents = await file.read()
+    try:
+        df = pd.read_csv(io.BytesIO(contents))
+    except Exception as e:
+        raise HTTPException(status_code=400, detail=f"CSV parse error: {e}")
     session_id = os.urandom(8).hex()
+    table_name = re.sub(r"[^a-zA-Z0-9_]", "_", os.path.splitext(file.filename)[0])[:32] or "data"
+    if table_name[0].isdigit():
+        table_name = "t_" + table_name
     db_bytes = csv_to_sqlite(df, table_name)
     schema = get_schema(db_bytes)
     _db_store[session_id] = db_bytes
     _schema_store[session_id] = schema
+    preview = df.head(5).to_dict(orient="records")
+    columns = list(df.columns)
+    return JSONResponse({
         "session_id": session_id,
+        "table_name": table_name,
+        "columns": columns,
+        "row_count": len(df),
+        "preview": preview,
+        "schema": schema,
+    })
 @app.post("/query")
 async def query(req: QueryRequest):
+    """Natural language question → SQL → execute → return results."""
     if req.session_id not in _db_store:
+        raise HTTPException(status_code=404, detail="Session not found. Please upload CSV first.")
     schema = _schema_store[req.session_id]
     sql = generate_sql(req.question, schema)
     results = execute_sql(sql, _db_store[req.session_id])
+    return JSONResponse({"sql": sql, "results": results})
 @app.get("/health")
 def health():
+    return {"status": "ok", "model": MODEL_NAME, "device": DEVICE}