Spaces:

nilotpaldhar2004
/

text2sql-chatbot

Sleeping

App Files Files Community

nilotpaldhar2004 committed 23 days ago

Commit

822614c

verified ·

1 Parent(s): bf7ba46

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -22

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 """
-QueryMind — CSV-to-SQL Engine
 Model: T5-Small Hybrid (Regex + Transformer)
-Target Hardware: HuggingFace Free Tier (CPU)
 """
 import os
@@ -26,13 +26,14 @@ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 # ── Model Initialization ──────────────────────────────────────────────────────
 print(f"[INFO] Loading model: {MODEL_NAME} | device: {DEVICE}")
-# Force use_fast=False to avoid the sentencepiece backend error
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
 model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(DEVICE)
 model.eval()
 print("[INFO] Model ready.")
-# ── State Management ──────────────────────────────────────────────────────────
 _db_store: dict[str, bytes] = {}   # session_id -> sqlite db bytes
 _schema_store: dict[str, str] = {} # session_id -> create table schema
@@ -54,11 +55,10 @@ def root():
 # ── Logic Helpers ──────────────────────────────────────────────────────────────
 def csv_to_sqlite(df: pd.DataFrame, table_name: str) -> bytes:
-    """Safely converts a Pandas DataFrame into a SQLite binary blob."""
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp_path = tmp.name
     conn = sqlite3.connect(tmp_path)
-    # Ensure the table name is safe for SQL
     safe_table = re.sub(r"[^a-zA-Z0-9_]", "_", table_name)
     df.to_sql(safe_table, conn, if_exists="replace", index=False)
     conn.close()
@@ -68,7 +68,7 @@ def csv_to_sqlite(df: pd.DataFrame, table_name: str) -> bytes:
     return db_bytes
 def get_schema(db_bytes: bytes) -> str:
-    """Extracts the exact SQL schema used to create the SQLite table."""
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp.write(db_bytes)
         tmp_path = tmp.name
@@ -81,8 +81,11 @@ def get_schema(db_bytes: bytes) -> str:
     return "\n".join(r[0] for r in rows if r[0])
 def generate_sql(question: str, schema: str) -> str:
-    """Hybrid Engine: Uses smart regex first, falls back to T5 with sanitization."""
-    # 1. Schema Context
     table_match = re.search(r'CREATE TABLE\s+"?(\w+)"?', schema, re.IGNORECASE)
     table_name = table_match.group(1) if table_match else "data"
     quoted = f'"{table_name}"'
@@ -90,14 +93,14 @@ def generate_sql(question: str, schema: str) -> str:
     q = question.lower().strip()
-    # 2. Smart Column Detection (Matches user words to schema)
     target_col = None
     for col in col_match:
         if col.lower() in q:
             target_col = col
             break
-    # 3. Deterministic Regex Layer (High Accuracy, Zero Latency)
     # DISTINCT/UNIQUE
     if re.search(r'unique|distinct', q):
@@ -114,20 +117,19 @@ def generate_sql(question: str, schema: str) -> str:
         num_col = target_col if target_col else next((c for c in col_match if re.search(r'pm|aqi|no|co|so|o3|benzene|val|amt', c, re.I)), col_match[0])
         return f'SELECT AVG("{num_col}") FROM {quoted}'
-    # COUNT/HOW MANY
     if re.search(r'count|total|how many', q):
-        # Handle word searches (e.g. "count Paris")
         if target_col and len(q.split()) > 2:
             return f'SELECT COUNT(*) FROM {quoted} WHERE "{target_col}" LIKE "%{q.split()[-1]}%"'
         return f'SELECT COUNT(*) FROM {quoted}'
-    # LIMIT/TOP
     if re.search(r'show|display|get|first|top', q):
         n_match = re.search(r'\d+', q)
         limit = n_match.group() if n_match else 10
         return f'SELECT * FROM {quoted} LIMIT {limit}'
-    # 4. Transformer Fallback (Probabilistic Reasoning)
     col_hint = ", ".join(col_match) if col_match else ""
     prompt = f"Translate English to SQL: {question} | Table: {table_name} | Columns: {col_hint}"
@@ -137,23 +139,19 @@ def generate_sql(question: str, schema: str) -> str:
     sql = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
-    # ── Output Sanitization Guardrails ──
-    # Remove T5 artifacts (pipes, prompt echoes)
     if "|" in sql: sql = sql.split("|")[-1].strip()
     sql = re.sub(r'^(sql|query|table):', '', sql, flags=re.IGNORECASE).strip()
-    # Force correct table references
     sql = re.sub(r'\bFROM\s+("?\w+"?)', f'FROM {quoted}', sql, flags=re.IGNORECASE)
     sql = re.sub(r'(FROM\s+"?\w+"?)\s+(?!WHERE|LIMIT|ORDER|GROUP|HAVING|JOIN|ON|AND|OR)(\w+)', r'\1', sql, flags=re.IGNORECASE)
-    # Final check for valid SELECT
     if not re.search(r'\bSELECT\b', sql, re.IGNORECASE):
         sql = f'SELECT * FROM {quoted} LIMIT 10'
     return sql
 def execute_sql(sql: str, db_bytes: bytes) -> list[dict]:
-    """Runs SQL against the binary blob by creating a temporary local SQLite DB."""
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp.write(db_bytes)
         tmp_path = tmp.name
@@ -187,7 +185,6 @@ async def upload_csv(file: UploadFile = File(...)):
         raise HTTPException(status_code=400, detail=f"CSV parse error: {e}")
     session_id = os.urandom(8).hex()
-    # Clean the filename to create a valid SQLite table name
     raw_name = os.path.splitext(file.filename)[0]
     table_name = re.sub(r"[^a-zA-Z0-9_]", "_", raw_name)[:32] or "data"
     if table_name[0].isdigit(): table_name = "t_" + table_name

 """
+QueryMind — CSV-to-SQL Engine (Final Production Version)
 Model: T5-Small Hybrid (Regex + Transformer)
+Hardware: HuggingFace Free Tier (CPU)
 """
 import os
 # ── Model Initialization ──────────────────────────────────────────────────────
 print(f"[INFO] Loading model: {MODEL_NAME} | device: {DEVICE}")
+# CRITICAL: use_fast=False fixes the sentencepiece/backend tokenizer error on CPU
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
 model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(DEVICE)
 model.eval()
 print("[INFO] Model ready.")
+# ── State Management (In-Memory) ──────────────────────────────────────────────
 _db_store: dict[str, bytes] = {}   # session_id -> sqlite db bytes
 _schema_store: dict[str, str] = {} # session_id -> create table schema
 # ── Logic Helpers ──────────────────────────────────────────────────────────────
 def csv_to_sqlite(df: pd.DataFrame, table_name: str) -> bytes:
+    """Converts Pandas DataFrame into a portable SQLite binary blob."""
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp_path = tmp.name
     conn = sqlite3.connect(tmp_path)
     safe_table = re.sub(r"[^a-zA-Z0-9_]", "_", table_name)
     df.to_sql(safe_table, conn, if_exists="replace", index=False)
     conn.close()
     return db_bytes
 def get_schema(db_bytes: bytes) -> str:
+    """Extracts the SQL schema used to create the table."""
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp.write(db_bytes)
         tmp_path = tmp.name
     return "\n".join(r[0] for r in rows if r[0])
 def generate_sql(question: str, schema: str) -> str:
+    """
+    Dual-Stream SQL Generation:
+    1. Deterministic (Regex) - Matches common analysis patterns.
+    2. Probabilistic (T5) - Handles complex phrasing as fallback.
+    """
     table_match = re.search(r'CREATE TABLE\s+"?(\w+)"?', schema, re.IGNORECASE)
     table_name = table_match.group(1) if table_match else "data"
     quoted = f'"{table_name}"'
     q = question.lower().strip()
+    # Smart Column Detection
     target_col = None
     for col in col_match:
         if col.lower() in q:
             target_col = col
             break
+    # ── Deterministic Layer ──
     # DISTINCT/UNIQUE
     if re.search(r'unique|distinct', q):
         num_col = target_col if target_col else next((c for c in col_match if re.search(r'pm|aqi|no|co|so|o3|benzene|val|amt', c, re.I)), col_match[0])
         return f'SELECT AVG("{num_col}") FROM {quoted}'
+    # COUNT
     if re.search(r'count|total|how many', q):
         if target_col and len(q.split()) > 2:
             return f'SELECT COUNT(*) FROM {quoted} WHERE "{target_col}" LIKE "%{q.split()[-1]}%"'
         return f'SELECT COUNT(*) FROM {quoted}'
+    # LIMIT
     if re.search(r'show|display|get|first|top', q):
         n_match = re.search(r'\d+', q)
         limit = n_match.group() if n_match else 10
         return f'SELECT * FROM {quoted} LIMIT {limit}'
+    # ── Probabilistic Fallback ──
     col_hint = ", ".join(col_match) if col_match else ""
     prompt = f"Translate English to SQL: {question} | Table: {table_name} | Columns: {col_hint}"
     sql = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
+    # Output Sanitization
     if "|" in sql: sql = sql.split("|")[-1].strip()
     sql = re.sub(r'^(sql|query|table):', '', sql, flags=re.IGNORECASE).strip()
     sql = re.sub(r'\bFROM\s+("?\w+"?)', f'FROM {quoted}', sql, flags=re.IGNORECASE)
     sql = re.sub(r'(FROM\s+"?\w+"?)\s+(?!WHERE|LIMIT|ORDER|GROUP|HAVING|JOIN|ON|AND|OR)(\w+)', r'\1', sql, flags=re.IGNORECASE)
     if not re.search(r'\bSELECT\b', sql, re.IGNORECASE):
         sql = f'SELECT * FROM {quoted} LIMIT 10'
     return sql
 def execute_sql(sql: str, db_bytes: bytes) -> list[dict]:
+    """Runs SQL against the binary blob via a temporary SQLite instance."""
     with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
         tmp.write(db_bytes)
         tmp_path = tmp.name
         raise HTTPException(status_code=400, detail=f"CSV parse error: {e}")
     session_id = os.urandom(8).hex()
     raw_name = os.path.splitext(file.filename)[0]
     table_name = re.sub(r"[^a-zA-Z0-9_]", "_", raw_name)[:32] or "data"
     if table_name[0].isdigit(): table_name = "t_" + table_name