Spaces:

nilotpaldhar2004
/

text2sql-chatbot

Running

App Files Files Community

nilotpaldhar2004 committed on 23 days ago

Commit

af59526

unverified ·

1 Parent(s): 7c20e07

Update model to defog/sqlcoder-7b-2 and adjust settings

Browse files

Files changed (1) hide show

app.py +59 -27

app.py CHANGED Viewed

@@ -1,6 +1,8 @@
 """
-app.py — Model: google/flan-t5-large (Text-to-SQL)
-HuggingFace Space: Free Tier (CPU)
 """
 import os
@@ -15,26 +17,40 @@ from fastapi.staticfiles import StaticFiles
 from fastapi.responses import FileResponse, JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import torch
 # ── Config ────────────────────────────────────────────────────────────────────
-MODEL_NAME = "cssupport/t5-small-awesome-text-to-sql"   # T5-based text→SQL, CPU-friendly
-MAX_NEW_TOKENS = 256
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-# ── Load model once at startup ─────────────────────────────────────────────────
 print(f"[INFO] Loading model: {MODEL_NAME}  |  device: {DEVICE}")
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(DEVICE)
 model.eval()
 print("[INFO] Model ready.")
-# ── In-memory DB store ─────────────────────────────────────────────────────────
-_db_store: dict[str, bytes] = {}   # session_id → sqlite db bytes
-_schema_store: dict[str, str] = {} # session_id → schema string
-app = FastAPI(title="CSV-to-SQL Chat", version="1.0.0")
 app.add_middleware(
     CORSMiddleware,
@@ -43,7 +59,6 @@ app.add_middleware(
     allow_headers=["*"],
 )
-# ── Static frontend ────────────────────────────────────────────────────────────
 app.mount("/static", StaticFiles(directory="static"), name="static")
 @app.get("/")
@@ -53,8 +68,6 @@ def root():
 # ── Helpers ────────────────────────────────────────────────────────────────────
 def csv_to_sqlite(df: pd.DataFrame, table_name: str = "data") -> bytes:
-    """Convert DataFrame → SQLite DB bytes."""
-    buf = io.BytesIO()
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp_path = tmp.name
     conn = sqlite3.connect(tmp_path)
@@ -67,7 +80,6 @@ def csv_to_sqlite(df: pd.DataFrame, table_name: str = "data") -> bytes:
 def get_schema(db_bytes: bytes) -> str:
-    """Extract CREATE TABLE schema from DB bytes."""
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp.write(db_bytes)
         tmp_path = tmp.name
@@ -80,42 +92,65 @@ def get_schema(db_bytes: bytes) -> str:
     return "\n".join(r[0] for r in rows if r[0])
 def generate_sql(question: str, schema: str) -> str:
-    """Run T5 inference to produce SQL."""
     # Extract table name from schema
     table_match = re.search(r'CREATE TABLE\s+"?(\w+)"?', schema, re.IGNORECASE)
     table_name = table_match.group(1) if table_match else "data"
     quoted = f'"{table_name}"'
-    prompt = f"tables:\n{schema}\nquery for: {question}"
     inputs = tokenizer(
         prompt,
         return_tensors="pt",
         truncation=True,
-        max_length=512,
     ).to(DEVICE)
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
             max_new_tokens=MAX_NEW_TOKENS,
             num_beams=4,
             early_stopping=True,
         )
-    sql = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
-    # Fix 1: replace any FROM/JOIN table reference (quoted or unquoted) with correct table
     sql = re.sub(r'\bFROM\s+("?\w+"?)', f'FROM {quoted}', sql, flags=re.IGNORECASE)
     sql = re.sub(r'\bJOIN\s+("?\w+"?)', f'JOIN {quoted}', sql, flags=re.IGNORECASE)
-    # Fix 2: strip junk tokens after table name before LIMIT/WHERE/ORDER etc.
-    # e.g. FROM "city_day" Datetime LIMIT 10  →  FROM "city_day" LIMIT 10
     sql = re.sub(
         r'(FROM\s+"?\w+"?)\s+(?!WHERE|LIMIT|ORDER|GROUP|HAVING|JOIN|LEFT|RIGHT|INNER|ON|AND|OR|\d)(\w+)',
         r'\1',
         sql, flags=re.IGNORECASE
     )
-    # Fix 3: fallback if no SELECT at all
     if not re.search(r'\bSELECT\b', sql, re.IGNORECASE):
         sql = f'SELECT * FROM {quoted} LIMIT 10'
@@ -123,7 +158,6 @@ def generate_sql(question: str, schema: str) -> str:
 def execute_sql(sql: str, db_bytes: bytes) -> list[dict]:
-    """Run SQL against the in-memory SQLite DB."""
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp.write(db_bytes)
         tmp_path = tmp.name
@@ -149,7 +183,6 @@ class QueryRequest(BaseModel):
 @app.post("/upload")
 async def upload_csv(file: UploadFile = File(...)):
-    """Upload CSV → parse → store as SQLite → return session_id & preview."""
     if not file.filename.endswith(".csv"):
         raise HTTPException(status_code=400, detail="Only CSV files accepted.")
     contents = await file.read()
@@ -182,9 +215,8 @@ async def upload_csv(file: UploadFile = File(...)):
 @app.post("/query")
 async def query(req: QueryRequest):
-    """Natural language question → SQL → execute → return results."""
     if req.session_id not in _db_store:
-        raise HTTPException(status_code=404, detail="Session not found. Please upload CSV first.")
     schema = _schema_store[req.session_id]
     sql = generate_sql(req.question, schema)
     results = execute_sql(sql, _db_store[req.session_id])

 """
+app.py — Model: defog/sqlcoder-7b-2 (Text-to-SQL)
+HuggingFace Space: Free Tier  (needs GPU Space or patience on CPU)
+NOTE: 7B model — use HF Spaces with GPU (T4 small) if available.
+      On CPU it will be slow (~60-120s per query) but will work.
 """
 import os
 from fastapi.responses import FileResponse, JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 import torch
 # ── Config ────────────────────────────────────────────────────────────────────
+MODEL_NAME = "defog/sqlcoder-7b-2"
+MAX_NEW_TOKENS = 300
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+LOAD_IN_8BIT = False   # set True if bitsandbytes is available on GPU space
+# ── Load model once ────────────────────────────────────────────────────────────
 print(f"[INFO] Loading model: {MODEL_NAME}  |  device: {DEVICE}")
+print("[INFO] This may take a few minutes on first load...")
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+model_kwargs = {
+    "torch_dtype": torch.float16 if DEVICE == "cuda" else torch.float32,
+    "device_map": "auto" if DEVICE == "cuda" else None,
+    "low_cpu_mem_usage": True,
+}
+if LOAD_IN_8BIT and DEVICE == "cuda":
+    model_kwargs["load_in_8bit"] = True
+model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_kwargs)
+if DEVICE == "cpu":
+    model = model.to(DEVICE)
 model.eval()
 print("[INFO] Model ready.")
+# ── In-memory store ────────────────────────────────────────────────────────────
+_db_store: dict[str, bytes] = {}
+_schema_store: dict[str, str] = {}
+app = FastAPI(title="CSV-to-SQL Chat (SQLCoder-7B)", version="1.0.0")
 app.add_middleware(
     CORSMiddleware,
     allow_headers=["*"],
 )
 app.mount("/static", StaticFiles(directory="static"), name="static")
 @app.get("/")
 # ── Helpers ────────────────────────────────────────────────────────────────────
 def csv_to_sqlite(df: pd.DataFrame, table_name: str = "data") -> bytes:
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp_path = tmp.name
     conn = sqlite3.connect(tmp_path)
 def get_schema(db_bytes: bytes) -> str:
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp.write(db_bytes)
         tmp_path = tmp.name
     return "\n".join(r[0] for r in rows if r[0])
+def build_prompt(question: str, schema: str) -> str:
+    """SQLCoder uses a specific prompt format."""
+    return f"""### Task
+Generate a SQL query to answer [QUESTION]{question}[/QUESTION]
+### Database Schema
+The query will run on a database with the following schema:
+{schema}
+### Answer
+Given the database schema, here is the SQL query that [QUESTION]{question}[/QUESTION]
+[SQL]
+"""
 def generate_sql(question: str, schema: str) -> str:
     # Extract table name from schema
     table_match = re.search(r'CREATE TABLE\s+"?(\w+)"?', schema, re.IGNORECASE)
     table_name = table_match.group(1) if table_match else "data"
     quoted = f'"{table_name}"'
+    prompt = build_prompt(question, schema)
     inputs = tokenizer(
         prompt,
         return_tensors="pt",
         truncation=True,
+        max_length=1024,
     ).to(DEVICE)
+    eos_token_id = tokenizer.eos_token_id
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
             max_new_tokens=MAX_NEW_TOKENS,
             num_beams=4,
             early_stopping=True,
+            pad_token_id=eos_token_id,
         )
+    # Decode only newly generated tokens
+    generated_ids = outputs[0][inputs["input_ids"].shape[1]:]
+    sql = tokenizer.decode(generated_ids, skip_special_tokens=True)
+    # Clean SQLCoder artifacts
+    sql = sql.split("[/SQL]")[0].strip()
+    sql = re.sub(r"```sql|```", "", sql).strip()
+    # Fix 1: replace any FROM/JOIN table reference with correct table
     sql = re.sub(r'\bFROM\s+("?\w+"?)', f'FROM {quoted}', sql, flags=re.IGNORECASE)
     sql = re.sub(r'\bJOIN\s+("?\w+"?)', f'JOIN {quoted}', sql, flags=re.IGNORECASE)
+    # Fix 2: strip junk tokens after table name
     sql = re.sub(
         r'(FROM\s+"?\w+"?)\s+(?!WHERE|LIMIT|ORDER|GROUP|HAVING|JOIN|LEFT|RIGHT|INNER|ON|AND|OR|\d)(\w+)',
         r'\1',
         sql, flags=re.IGNORECASE
     )
+    # Fix 3: fallback if no SELECT
     if not re.search(r'\bSELECT\b', sql, re.IGNORECASE):
         sql = f'SELECT * FROM {quoted} LIMIT 10'
 def execute_sql(sql: str, db_bytes: bytes) -> list[dict]:
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp.write(db_bytes)
         tmp_path = tmp.name
 @app.post("/upload")
 async def upload_csv(file: UploadFile = File(...)):
     if not file.filename.endswith(".csv"):
         raise HTTPException(status_code=400, detail="Only CSV files accepted.")
     contents = await file.read()
 @app.post("/query")
 async def query(req: QueryRequest):
     if req.session_id not in _db_store:
+        raise HTTPException(status_code=404, detail="Session not found. Upload CSV first.")
     schema = _schema_store[req.session_id]
     sql = generate_sql(req.question, schema)
     results = execute_sql(sql, _db_store[req.session_id])