Spaces:

nilotpaldhar2004
/

text2sql-chatbot

Sleeping

App Files Files Community

nilotpaldhar2004 committed 24 days ago

Commit

b56ee61

verified ·

1 Parent(s): a02ba1b

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -55

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 """
-app.py — Model: google/flan-t5-large (Text-to-SQL)
-HuggingFace Space: Free Tier (CPU)
 """
 import os
@@ -18,23 +19,23 @@ from pydantic import BaseModel
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import torch
-# ── Config ────────────────────────────────────────────────────────────────────
-MODEL_NAME = "cssupport/t5-small-awesome-text-to-sql"   # T5-based text→SQL, CPU-friendly
 MAX_NEW_TOKENS = 256
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-# ── Load model once at startup ─────────────────────────────────────────────────
-print(f"[INFO] Loading model: {MODEL_NAME}  |  device: {DEVICE}")
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(DEVICE)
 model.eval()
 print("[INFO] Model ready.")
-# ── In-memory DB store ─────────────────────────────────────────────────────────
-_db_store: dict[str, bytes] = {}   # session_id → sqlite db bytes
-_schema_store: dict[str, str] = {} # session_id → schema string
-app = FastAPI(title="CSV-to-SQL Chat", version="1.0.0")
 app.add_middleware(
     CORSMiddleware,
@@ -43,31 +44,30 @@ app.add_middleware(
     allow_headers=["*"],
 )
-# ── Static frontend ────────────────────────────────────────────────────────────
 app.mount("/static", StaticFiles(directory="static"), name="static")
 @app.get("/")
 def root():
     return FileResponse("static/index.html")
-# ── Helpers ────────────────────────────────────────────────────────────────────
-def csv_to_sqlite(df: pd.DataFrame, table_name: str = "data") -> bytes:
-    """Convert DataFrame → SQLite DB bytes."""
-    buf = io.BytesIO()
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp_path = tmp.name
     conn = sqlite3.connect(tmp_path)
-    df.to_sql(table_name, conn, if_exists="replace", index=False)
     conn.close()
     with open(tmp_path, "rb") as f:
         db_bytes = f.read()
     os.unlink(tmp_path)
     return db_bytes
 def get_schema(db_bytes: bytes) -> str:
-    """Extract CREATE TABLE schema from DB bytes."""
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp.write(db_bytes)
         tmp_path = tmp.name
@@ -79,42 +79,54 @@ def get_schema(db_bytes: bytes) -> str:
     os.unlink(tmp_path)
     return "\n".join(r[0] for r in rows if r[0])
 def generate_sql(question: str, schema: str) -> str:
-    # 1. Context Extraction (Same as before)
     table_match = re.search(r'CREATE TABLE\s+"?(\w+)"?', schema, re.IGNORECASE)
     table_name = table_match.group(1) if table_match else "data"
     quoted = f'"{table_name}"'
     col_match = re.findall(r'"(\w+)"', schema)
     q = question.lower().strip()
-    # 2. Smart Column Detection
     target_col = None
     for col in col_match:
         if col.lower() in q:
             target_col = col
             break
-    # 3. Enhanced Rule-Based Shortcuts (Deterministic)
     if re.search(r'unique|distinct', q):
         col = target_col if target_col else (col_match[0] if col_match else "*")
         return f'SELECT COUNT(DISTINCT "{col}") FROM {quoted}'
     if re.search(r'group.*by|per|each', q):
         col = target_col if target_col else (col_match[0] if col_match else "data")
         return f'SELECT "{col}", COUNT(*) FROM {quoted} GROUP BY "{col}"'
-    if re.search(r'count.*(total|all|record|row|paris)|how many', q):
-        # Special case for "Count the Paris" -> We search for 'Paris' in all columns
-        if "paris" in q:
-            return f'SELECT COUNT(*) FROM {quoted} WHERE "answer" LIKE "%Paris%" OR "question" LIKE "%Paris%"'
         return f'SELECT COUNT(*) FROM {quoted}'
     if re.search(r'show|display|get|first|top', q):
         n_match = re.search(r'\d+', q)
-        return f'SELECT * FROM {quoted} LIMIT {n_match.group() if n_match else 10}'
-    # 4. T5 Model Fallback
     col_hint = ", ".join(col_match) if col_match else ""
     prompt = f"Translate English to SQL: {question} | Table: {table_name} | Columns: {col_hint}"
@@ -124,23 +136,23 @@ def generate_sql(question: str, schema: str) -> str:
     sql = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
-    # ── CRITICAL CLEANING GUARDRAIL ──
-    # This removes hallucinations like "Table | SQL | Columns" from the output
-    if "|" in sql:
-        sql = sql.split("|")[-1].strip() # Take only the part after the last pipe
-    # Remove common prefix hallucinations
-    sql = re.sub(r'^(sql|query|result|table):', '', sql, flags=re.IGNORECASE).strip()
-    # Force Table and SELECT
     sql = re.sub(r'\bFROM\s+("?\w+"?)', f'FROM {quoted}', sql, flags=re.IGNORECASE)
     if not re.search(r'\bSELECT\b', sql, re.IGNORECASE):
         sql = f'SELECT * FROM {quoted} LIMIT 10'
     return sql
 def execute_sql(sql: str, db_bytes: bytes) -> list[dict]:
-    """Run SQL against the in-memory SQLite DB."""
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp.write(db_bytes)
         tmp_path = tmp.name
@@ -157,18 +169,16 @@ def execute_sql(sql: str, db_bytes: bytes) -> list[dict]:
     os.unlink(tmp_path)
     return rows
-# ── Routes ─────────────────────────────────────────────────────────────────────
 class QueryRequest(BaseModel):
     session_id: str
     question: str
 @app.post("/upload")
 async def upload_csv(file: UploadFile = File(...)):
-    """Upload CSV → parse → store as SQLite → return session_id & preview."""
     if not file.filename.endswith(".csv"):
         raise HTTPException(status_code=400, detail="Only CSV files accepted.")
     contents = await file.read()
     try:
         df = pd.read_csv(io.BytesIO(contents))
@@ -176,38 +186,37 @@ async def upload_csv(file: UploadFile = File(...)):
         raise HTTPException(status_code=400, detail=f"CSV parse error: {e}")
     session_id = os.urandom(8).hex()
-    table_name = re.sub(r"[^a-zA-Z0-9_]", "_", os.path.splitext(file.filename)[0])[:32] or "data"
-    if table_name[0].isdigit():
-        table_name = "t_" + table_name
     db_bytes = csv_to_sqlite(df, table_name)
     schema = get_schema(db_bytes)
     _db_store[session_id] = db_bytes
     _schema_store[session_id] = schema
-    preview = df.head(5).to_dict(orient="records")
-    columns = list(df.columns)
     return JSONResponse({
         "session_id": session_id,
         "table_name": table_name,
-        "columns": columns,
         "row_count": len(df),
-        "preview": preview,
         "schema": schema,
     })
 @app.post("/query")
 async def query(req: QueryRequest):
-    """Natural language question → SQL → execute → return results."""
     if req.session_id not in _db_store:
-        raise HTTPException(status_code=404, detail="Session not found. Please upload CSV first.")
     schema = _schema_store[req.session_id]
     sql = generate_sql(req.question, schema)
     results = execute_sql(sql, _db_store[req.session_id])
     return JSONResponse({"sql": sql, "results": results})
 @app.get("/health")
 def health():
-    return {"status": "ok", "model": MODEL_NAME, "device": DEVICE}

 """
+QueryMind — CSV-to-SQL Engine
+Model: T5-Small Hybrid (Regex + Transformer)
+Target Hardware: HuggingFace Free Tier (CPU)
 """
 import os
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import torch
+# ── Configuration ─────────────────────────────────────────────────────────────
+MODEL_NAME = "cssupport/t5-small-awesome-text-to-sql"
 MAX_NEW_TOKENS = 256
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# ── Model Initialization ──────────────────────────────────────────────────────
+print(f"[INFO] Loading model: {MODEL_NAME} | device: {DEVICE}")
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(DEVICE)
 model.eval()
 print("[INFO] Model ready.")
+# ── State Management ──────────────────────────────────────────────────────────
+_db_store: dict[str, bytes] = {}   # session_id -> sqlite db bytes
+_schema_store: dict[str, str] = {} # session_id -> create table schema
+app = FastAPI(title="QueryMind Engine", version="1.1.0")
 app.add_middleware(
     CORSMiddleware,
     allow_headers=["*"],
 )
+# ── Static Frontend ───────────────────────────────────────────────────────────
 app.mount("/static", StaticFiles(directory="static"), name="static")
 @app.get("/")
 def root():
     return FileResponse("static/index.html")
+# ── Logic Helpers ──────────────────────────────────────────────────────────────
+def csv_to_sqlite(df: pd.DataFrame, table_name: str) -> bytes:
+    """Safely converts a Pandas DataFrame into a SQLite binary blob."""
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp_path = tmp.name
     conn = sqlite3.connect(tmp_path)
+    # Ensure the table name is safe for SQL
+    safe_table = re.sub(r"[^a-zA-Z0-9_]", "_", table_name)
+    df.to_sql(safe_table, conn, if_exists="replace", index=False)
     conn.close()
     with open(tmp_path, "rb") as f:
         db_bytes = f.read()
     os.unlink(tmp_path)
     return db_bytes
 def get_schema(db_bytes: bytes) -> str:
+    """Extracts the exact SQL schema used to create the SQLite table."""
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp.write(db_bytes)
         tmp_path = tmp.name
     os.unlink(tmp_path)
     return "\n".join(r[0] for r in rows if r[0])
 def generate_sql(question: str, schema: str) -> str:
+    """Hybrid Engine: Uses smart regex first, falls back to T5 with sanitization."""
+    # 1. Schema Context
     table_match = re.search(r'CREATE TABLE\s+"?(\w+)"?', schema, re.IGNORECASE)
     table_name = table_match.group(1) if table_match else "data"
     quoted = f'"{table_name}"'
     col_match = re.findall(r'"(\w+)"', schema)
     q = question.lower().strip()
+    # 2. Smart Column Detection (Matches user words to schema)
     target_col = None
     for col in col_match:
         if col.lower() in q:
             target_col = col
             break
+    # 3. Deterministic Regex Layer (High Accuracy, Zero Latency)
+    # DISTINCT/UNIQUE
     if re.search(r'unique|distinct', q):
         col = target_col if target_col else (col_match[0] if col_match else "*")
         return f'SELECT COUNT(DISTINCT "{col}") FROM {quoted}'
+    # GROUP BY
     if re.search(r'group.*by|per|each', q):
         col = target_col if target_col else (col_match[0] if col_match else "data")
         return f'SELECT "{col}", COUNT(*) FROM {quoted} GROUP BY "{col}"'
+    # AVERAGE
+    if re.search(r'average|mean|avg', q):
+        num_col = target_col if target_col else next((c for c in col_match if re.search(r'pm|aqi|no|co|so|o3|benzene|val|amt', c, re.I)), col_match[0])
+        return f'SELECT AVG("{num_col}") FROM {quoted}'
+    # COUNT/HOW MANY
+    if re.search(r'count|total|how many', q):
+        # Handle word searches (e.g. "count Paris")
+        if target_col and len(q.split()) > 2:
+            return f'SELECT COUNT(*) FROM {quoted} WHERE "{target_col}" LIKE "%{q.split()[-1]}%"'
         return f'SELECT COUNT(*) FROM {quoted}'
+    # LIMIT/TOP
     if re.search(r'show|display|get|first|top', q):
         n_match = re.search(r'\d+', q)
+        limit = n_match.group() if n_match else 10
+        return f'SELECT * FROM {quoted} LIMIT {limit}'
+    # 4. Transformer Fallback (Probabilistic Reasoning)
     col_hint = ", ".join(col_match) if col_match else ""
     prompt = f"Translate English to SQL: {question} | Table: {table_name} | Columns: {col_hint}"
     sql = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
+    # ── Output Sanitization Guardrails ──
+    # Remove T5 artifacts (pipes, prompt echoes)
+    if "|" in sql: sql = sql.split("|")[-1].strip()
+    sql = re.sub(r'^(sql|query|table):', '', sql, flags=re.IGNORECASE).strip()
+    # Force correct table references
     sql = re.sub(r'\bFROM\s+("?\w+"?)', f'FROM {quoted}', sql, flags=re.IGNORECASE)
+    sql = re.sub(r'(FROM\s+"?\w+"?)\s+(?!WHERE|LIMIT|ORDER|GROUP|HAVING|JOIN|ON|AND|OR)(\w+)', r'\1', sql, flags=re.IGNORECASE)
+    # Final check for valid SELECT
     if not re.search(r'\bSELECT\b', sql, re.IGNORECASE):
         sql = f'SELECT * FROM {quoted} LIMIT 10'
     return sql
 def execute_sql(sql: str, db_bytes: bytes) -> list[dict]:
+    """Runs SQL against the binary blob by creating a temporary local SQLite DB."""
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp.write(db_bytes)
         tmp_path = tmp.name
     os.unlink(tmp_path)
     return rows
+# ── API Endpoints ─────────────────────────────────────────────────────────────
 class QueryRequest(BaseModel):
     session_id: str
     question: str
 @app.post("/upload")
 async def upload_csv(file: UploadFile = File(...)):
     if not file.filename.endswith(".csv"):
         raise HTTPException(status_code=400, detail="Only CSV files accepted.")
     contents = await file.read()
     try:
         df = pd.read_csv(io.BytesIO(contents))
         raise HTTPException(status_code=400, detail=f"CSV parse error: {e}")
     session_id = os.urandom(8).hex()
+    # Clean the filename to create a valid SQLite table name
+    raw_name = os.path.splitext(file.filename)[0]
+    table_name = re.sub(r"[^a-zA-Z0-9_]", "_", raw_name)[:32] or "data"
+    if table_name[0].isdigit(): table_name = "t_" + table_name
     db_bytes = csv_to_sqlite(df, table_name)
     schema = get_schema(db_bytes)
     _db_store[session_id] = db_bytes
     _schema_store[session_id] = schema
     return JSONResponse({
         "session_id": session_id,
         "table_name": table_name,
+        "columns": list(df.columns),
         "row_count": len(df),
+        "preview": df.head(5).to_dict(orient="records"),
         "schema": schema,
     })
 @app.post("/query")
 async def query(req: QueryRequest):
     if req.session_id not in _db_store:
+        raise HTTPException(status_code=404, detail="Session expired or not found.")
     schema = _schema_store[req.session_id]
     sql = generate_sql(req.question, schema)
     results = execute_sql(sql, _db_store[req.session_id])
     return JSONResponse({"sql": sql, "results": results})
 @app.get("/health")
 def health():
+    return {"status": "ok", "model": MODEL_NAME, "device": DEVICE}