Spaces:

nilotpaldhar2004
/

text2sql-chatbot

Sleeping

App Files Files Community

nilotpaldhar2004 committed 23 days ago

Commit

fff6817

verified ·

1 Parent(s): 1e5473b

Update app.py

Browse files

Files changed (1) hide show

app.py +57 -91

app.py CHANGED Viewed

@@ -1,13 +1,12 @@
 """
-QueryMind — CSV-to-SQL Engine (Final Production Version)
-Model: T5-Small Hybrid (Regex + Transformer)
-Hardware: HuggingFace Free Tier (CPU)
 """
 import os
 import re
 import io
-import json
 import sqlite3
 import tempfile
 import pandas as pd
@@ -16,28 +15,31 @@ from fastapi.staticfiles import StaticFiles
 from fastapi.responses import FileResponse, JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import torch
 # ── Configuration ─────────────────────────────────────────────────────────────
-MODEL_NAME = "cssupport/t5-small-awesome-text-to-sql"
-MAX_NEW_TOKENS = 256
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 # ── Model Initialization ──────────────────────────────────────────────────────
-print(f"[INFO] Loading model: {MODEL_NAME} | device: {DEVICE}")
-# CRITICAL: use_fast=False fixes the sentencepiece/backend tokenizer error on CPU
-tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
-model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(DEVICE)
 model.eval()
-print("[INFO] Model ready.")
-# ── State Management (In-Memory) ──────────────────────────────────────────────
-_db_store: dict[str, bytes] = {}   # session_id -> sqlite db bytes
-_schema_store: dict[str, str] = {} # session_id -> create table schema
-app = FastAPI(title="QueryMind Engine", version="1.1.0")
 app.add_middleware(
     CORSMiddleware,
@@ -46,7 +48,6 @@ app.add_middleware(
     allow_headers=["*"],
 )
-# ── Static Frontend ───────────────────────────────────────────────────────────
 app.mount("/static", StaticFiles(directory="static"), name="static")
 @app.get("/")
@@ -55,7 +56,6 @@ def root():
 # ── Logic Helpers ──────────────────────────────────────────────────────────────
 def csv_to_sqlite(df: pd.DataFrame, table_name: str) -> bytes:
-    """Converts Pandas DataFrame into a portable SQLite binary blob."""
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp_path = tmp.name
     conn = sqlite3.connect(tmp_path)
@@ -68,7 +68,6 @@ def csv_to_sqlite(df: pd.DataFrame, table_name: str) -> bytes:
     return db_bytes
 def get_schema(db_bytes: bytes) -> str:
-    """Extracts the SQL schema used to create the table."""
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp.write(db_bytes)
         tmp_path = tmp.name
@@ -80,76 +79,49 @@ def get_schema(db_bytes: bytes) -> str:
     os.unlink(tmp_path)
     return "\n".join(r[0] for r in rows if r[0])
 def generate_sql(question: str, schema: str) -> str:
-    # 1. Context Extraction
     table_match = re.search(r'CREATE TABLE\s+"?(\w+)"?', schema, re.IGNORECASE)
     table_name = table_match.group(1) if table_match else "data"
-    quoted = f'"{table_name}"'
-    col_match = re.findall(r'"(\w+)"', schema)
-    q = question.lower().strip()
-    # Smart Column Detection
-    target_col = None
-    for col in col_match:
-        if col.lower() in q:
-            target_col = col
-            break
-    # 2. Advanced Rule-Based Shortcuts
-    # FILTERING (e.g., "ans is Asia")
-    if "is" in q or "where" in q:
-        # Improved value extraction: look for the last word in the sentence
-        words = q.split()
-        val = words[-1].strip("?.!")
-        # Determine columns
-        select_col = col_match[0] if "question" in q else "*"
-        filter_col = target_col if target_col else (col_match[1] if len(col_match)>1 else col_match[0])
-        # Don't trigger if the 'value' is just a common instruction word
-        if val not in ["null", "not", "the", "average", "rows"]:
-            return f'SELECT "{select_col}" FROM {quoted} WHERE "{filter_col}" LIKE "%{val}%"'
-    # SELECT DISTINCT (List) vs COUNT DISTINCT (Number)
-    if re.search(r'unique|distinct', q):
-        col = target_col if target_col else col_match[0]
-        if re.search(r'show|list|get|give', q):
-            return f'SELECT DISTINCT "{col}" FROM {quoted} LIMIT 50'
-        return f'SELECT COUNT(DISTINCT "{col}") FROM {quoted}'
-    # AGGREGATIONS
-    if re.search(r'average|mean|avg', q):
-        num_col = target_col if target_col else (col_match[1] if len(col_match)>1 else col_match[0])
-        return f'SELECT AVG("{num_col}") FROM {quoted}'
-    # LIMIT/SHOW
-    if re.search(r'show|display|get|first|top', q) and not target_col:
-        n_match = re.search(r'\d+', q)
-        return f'SELECT * FROM {quoted} LIMIT {n_match.group() if n_match else 10}'
-    # 3. Transformer Fallback (MANDATORY FIX)
-    # Ensure this part is NOT skipped
-    col_hint = ", ".join(col_match) if col_match else ""
-    prompt = f"Translate English to SQL: {question} | Table: {table_name} | Columns: {col_hint}"
-    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(DEVICE)
     with torch.no_grad():
-        outputs = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS, num_beams=4, early_stopping=True)
-    sql = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
-    # Sanitization
-    if "|" in sql: sql = sql.split("|")[-1].strip()
-    sql = re.sub(r'\bFROM\s+("?\w+"?)', f'FROM {quoted}', sql, flags=re.IGNORECASE)
-    if not re.search(r'\bSELECT\b', sql, re.IGNORECASE):
-        sql = f'SELECT * FROM {quoted} LIMIT 10'
     return sql
 def execute_sql(sql: str, db_bytes: bytes) -> list[dict]:
-    """Runs SQL against the binary blob via a temporary SQLite instance."""
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp.write(db_bytes)
         tmp_path = tmp.name
@@ -161,7 +133,7 @@ def execute_sql(sql: str, db_bytes: bytes) -> list[dict]:
     except Exception as e:
         conn.close()
         os.unlink(tmp_path)
-        raise HTTPException(status_code=400, detail=f"SQL error: {e}")
     conn.close()
     os.unlink(tmp_path)
     return rows
@@ -177,10 +149,7 @@ async def upload_csv(file: UploadFile = File(...)):
         raise HTTPException(status_code=400, detail="Only CSV files accepted.")
     contents = await file.read()
-    try:
-        df = pd.read_csv(io.BytesIO(contents))
-    except Exception as e:
-        raise HTTPException(status_code=400, detail=f"CSV parse error: {e}")
     session_id = os.urandom(8).hex()
     raw_name = os.path.splitext(file.filename)[0]
@@ -188,24 +157,21 @@ async def upload_csv(file: UploadFile = File(...)):
     if table_name[0].isdigit(): table_name = "t_" + table_name
     db_bytes = csv_to_sqlite(df, table_name)
-    schema = get_schema(db_bytes)
     _db_store[session_id] = db_bytes
-    _schema_store[session_id] = schema
     return JSONResponse({
         "session_id": session_id,
-        "table_name": table_name,
         "columns": list(df.columns),
         "row_count": len(df),
         "preview": df.head(5).to_dict(orient="records"),
-        "schema": schema,
     })
 @app.post("/query")
 async def query(req: QueryRequest):
     if req.session_id not in _db_store:
-        raise HTTPException(status_code=404, detail="Session expired or not found.")
     schema = _schema_store[req.session_id]
     sql = generate_sql(req.question, schema)
@@ -215,4 +181,4 @@ async def query(req: QueryRequest):
 @app.get("/health")
 def health():
-    return {"status": "ok", "model": MODEL_NAME, "device": DEVICE}

 """
+QueryMind — CSV-to-SQL Engine (High Performance 7B Version)
+Model: SQLCoder-7B-2 (State-of-the-art Text-to-SQL)
+Hardware: HuggingFace ZeroGPU (Free A100 Tier)
 """
 import os
 import re
 import io
 import sqlite3
 import tempfile
 import pandas as pd
 from fastapi.responses import FileResponse, JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
+from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
+import spaces # Required for HuggingFace ZeroGPU
 # ── Configuration ─────────────────────────────────────────────────────────────
+MODEL_ID = "defog/sqlcoder-7b-2"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 # ── Model Initialization ──────────────────────────────────────────────────────
+print(f"[INFO] Loading 7B Model: {MODEL_ID}")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    trust_remote_code=True,
+    torch_dtype=torch.float16,
+    device_map="auto" if torch.cuda.is_available() else None
+)
 model.eval()
+print("[INFO] 7B Model ready.")
+# ── State Management ──────────────────────────────────────────────────────────
+_db_store: dict[str, bytes] = {}
+_schema_store: dict[str, str] = {}
+app = FastAPI(title="QueryMind 7B", version="2.0.0")
 app.add_middleware(
     CORSMiddleware,
     allow_headers=["*"],
 )
 app.mount("/static", StaticFiles(directory="static"), name="static")
 @app.get("/")
 # ── Logic Helpers ──────────────────────────────────────────────────────────────
 def csv_to_sqlite(df: pd.DataFrame, table_name: str) -> bytes:
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp_path = tmp.name
     conn = sqlite3.connect(tmp_path)
     return db_bytes
 def get_schema(db_bytes: bytes) -> str:
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp.write(db_bytes)
         tmp_path = tmp.name
     os.unlink(tmp_path)
     return "\n".join(r[0] for r in rows if r[0])
+# ── 7B Inference with ZeroGPU Decorator ──────────────────────────────────────
+@spaces.GPU(duration=60) # <── This is the secret for Free GPU access
 def generate_sql(question: str, schema: str) -> str:
+    """Uses SQLCoder-7B to generate high-accuracy SQL."""
+    # Extract table name for the prompt
     table_match = re.search(r'CREATE TABLE\s+"?(\w+)"?', schema, re.IGNORECASE)
     table_name = table_match.group(1) if table_match else "data"
+    # Prompt format required by SQLCoder
+    prompt = f"""### Task
+Generate a SQL query to answer the question based on the table schema.
+### Schema
+{schema}
+### Question
+{question}
+### SQL
+"""
+    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
     with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=200,
+            do_sample=False, # Use greedy decoding for SQL accuracy
+            num_beams=1
+        )
+    full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    # Extract only the SQL part after the prompt
+    sql = full_output.split("### SQL")[-1].strip()
+    # Basic cleanup
+    sql = sql.split(';')[0].strip() # Take only the first statement
     return sql
 def execute_sql(sql: str, db_bytes: bytes) -> list[dict]:
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp.write(db_bytes)
         tmp_path = tmp.name
     except Exception as e:
         conn.close()
         os.unlink(tmp_path)
+        raise HTTPException(status_code=400, detail=f"SQL Error: {e}")
     conn.close()
     os.unlink(tmp_path)
     return rows
         raise HTTPException(status_code=400, detail="Only CSV files accepted.")
     contents = await file.read()
+    df = pd.read_csv(io.BytesIO(contents))
     session_id = os.urandom(8).hex()
     raw_name = os.path.splitext(file.filename)[0]
     if table_name[0].isdigit(): table_name = "t_" + table_name
     db_bytes = csv_to_sqlite(df, table_name)
     _db_store[session_id] = db_bytes
+    _schema_store[session_id] = get_schema(db_bytes)
     return JSONResponse({
         "session_id": session_id,
         "columns": list(df.columns),
         "row_count": len(df),
         "preview": df.head(5).to_dict(orient="records"),
+        "schema": _schema_store[session_id],
     })
 @app.post("/query")
 async def query(req: QueryRequest):
     if req.session_id not in _db_store:
+        raise HTTPException(status_code=404, detail="Session expired.")
     schema = _schema_store[req.session_id]
     sql = generate_sql(req.question, schema)
 @app.get("/health")
 def health():
+    return {"status": "ok", "model": MODEL_ID, "device": DEVICE}