Spaces:

DearmonAnalytics
/

SQL_INTRO

Sleeping

App Files Files Community

jtdearmon committed on Sep 5, 2025

Commit

af6b0e1

verified ·

1 Parent(s): 4909489

Update app.py

Browse files

Files changed (1) hide show

app.py +199 -36

app.py CHANGED Viewed

@@ -217,7 +217,7 @@ FALLBACK_QUESTIONS = [
      "requires_aliases":False,"required_aliases":[]},
 ]
-# -------------------- OpenAI JSON schema (validated after parse) --------------------
 DOMAIN_AND_QUESTIONS_SCHEMA = {
     "required": ["domain", "tables", "questions"]
 }
@@ -252,7 +252,6 @@ def _loose_json_parse(s: str) -> Optional[dict]:
         return json.loads(s)
     except Exception:
         pass
-    # Try to find the first {...} block
     start = s.find("{")
     end = s.rfind("}")
     if start != -1 and end != -1 and end > start:
@@ -262,20 +261,151 @@ def _loose_json_parse(s: str) -> Optional[dict]:
             return None
     return None
-def llm_generate_domain_and_questions(prev_domain: Optional[str]) -> Tuple[Optional[Dict[str,Any]], Optional[str], Optional[str]]:
     """
-    Returns (obj, error_message, model_used).
-    Uses Chat Completions JSON mode if available; otherwise falls back to strict-instruction parsing.
     """
     if not OPENAI_AVAILABLE or not os.getenv("OPENAI_API_KEY"):
-        return None, "OpenAI client not available or OPENAI_API_KEY missing.", None
     errors = []
     prompt = _domain_prompt(prev_domain)
     for model in _candidate_models():
-        # Try JSON mode first (if supported)
         try:
             try:
                 chat = _client.chat.completions.create(
                     model=model,
@@ -285,7 +415,7 @@ def llm_generate_domain_and_questions(prev_domain: Optional[str]) -> Tuple[Optio
                 )
                 data_text = chat.choices[0].message.content
             except TypeError:
-                # Older SDKs: no response_format argument → plain completion with strict instructions
                 chat = _client.chat.completions.create(
                     model=model,
                     messages=[{"role":"system","content":"Return ONLY a JSON object. No markdown."},
@@ -294,31 +424,49 @@ def llm_generate_domain_and_questions(prev_domain: Optional[str]) -> Tuple[Optio
                 )
                 data_text = chat.choices[0].message.content
-            obj = _loose_json_parse(data_text or "")
-            if not obj:
                 raise RuntimeError("Could not parse JSON from model output.")
-            # Minimal validation
             for k in DOMAIN_AND_QUESTIONS_SCHEMA["required"]:
-                if k not in obj:
                     raise RuntimeError(f"Missing key '{k}'")
-            # Guardrails: strip RIGHT/FULL joins from answers
             clean_qs = []
-            for q in obj.get("questions", []):
-                answers = [a for a in q.get("answer_sql", []) if " right join " not in a.lower() and " full " not in a.lower()]
                 if not answers:
                     continue
-                q["answer_sql"] = answers
-                q.setdefault("requires_aliases", False)
-                q.setdefault("required_aliases", [])
-                clean_qs.append(q)
-            obj["questions"] = clean_qs
-            return obj, None, model
         except Exception as e:
             errors.append(f"{model}: {e}")
             continue
-    return None, "; ".join(errors) if errors else "Unknown LLM error.", None
 # -------------------- Schema install & question handling --------------------
 def drop_existing_domain_tables(con: sqlite3.Connection, keep_internal=True):
@@ -465,18 +613,22 @@ def load_questions(obj_list: List[Dict[str,Any]]) -> List[Dict[str,Any]]:
 # -------------------- Domain bootstrap --------------------
 def bootstrap_domain_with_llm_or_fallback(prev_domain: Optional[str]):
-    obj, err, model_used = llm_generate_domain_and_questions(prev_domain)
     if obj is None:
-        return FALLBACK_SCHEMA, FALLBACK_QUESTIONS, {"source":"fallback","model":None,"error":err}
-    return obj, obj["questions"], {"source":"openai","model":model_used,"error":None}
-def install_new_domain(prev_domain: Optional[str]):
     schema, questions, info = bootstrap_domain_with_llm_or_fallback(prev_domain)
     install_schema(CONN, schema)
     return schema, questions, info
 # -------------------- Session state --------------------
-CURRENT_SCHEMA, CURRENT_QS, CURRENT_INFO = install_new_domain(prev_domain=None)
 # -------------------- Progress + mastery --------------------
 def upsert_user(con: sqlite3.Connection, user_id: str, name: str):
@@ -510,11 +662,13 @@ def fetch_attempts(con: sqlite3.Connection, user_id: str) -> pd.DataFrame:
         return pd.read_sql_query("SELECT * FROM attempts WHERE user_id=? ORDER BY id DESC", con, params=(user_id,))
 def pick_next_question(user_id: str) -> Dict[str,Any]:
     df = fetch_attempts(CONN, user_id)
     stats = topic_stats(df)
     stats = stats.sort_values(by=["accuracy","attempts"], ascending=[True, True])
     weakest = stats.iloc[0]["category"] if not stats.empty else CATEGORIES_ORDER[0]
-    cands = [q for q in CURRENT_QS if q["category"] == weakest] or CURRENT_QS
     return dict(random.choice(cands))
 # -------------------- Execution & feedback --------------------
@@ -679,7 +833,7 @@ def submit_answer(sql_text: str, session: dict):
     if err:
         fb = f"❌ **Did not run**\n\n{err}"
         if details: fb += "\n\n" + "\n".join(details)
-        log_attempt(user_id, q["id"], q["category"], False, sql_text, elapsed, int(q["difficulty"]), "bank", " | ".join([err] + details))
         stats = topic_stats(fetch_attempts(CONN, user_id))
         return gr.update(value=fb, visible=True), pd.DataFrame(), gr.update(visible=True), stats
@@ -698,7 +852,7 @@ def submit_answer(sql_text: str, session: dict):
         feedback += "\n\n" + "\n".join(details)
     feedback += "\n\n" + explanation + "\n\n**One acceptable solution:**\n```sql\n" + q["answer_sql"][0].rstrip(";") + ";\n```"
-    log_attempt(user_id, q["id"], q["category"], bool(is_correct), sql_text, elapsed, int(q["difficulty"]), "bank", " | ".join(details))
     stats = topic_stats(fetch_attempts(CONN, user_id))
     return gr.update(value=feedback, visible=True), (df if df is not None else pd.DataFrame()), gr.update(visible=True), stats
@@ -715,7 +869,7 @@ def next_question(session: dict):
 def show_hint(session: dict):
     if not session or "q" not in session:
         return gr.update(value="Start a session first.", visible=True)
-    cat = session["q"]["category"]
     hint = {
         "SELECT *": "Use `SELECT * FROM table_name`.",
         "SELECT columns": "List columns: `SELECT col1, col2 FROM table_name`.",
@@ -734,16 +888,25 @@ def export_progress(user_name: str):
     if not slug:
         return None
     user_id = slug[:64]
-    df = fetch_attempts(CONN, user_id)
     os.makedirs(EXPORT_DIR, exist_ok=True)
     path = os.path.abspath(os.path.join(EXPORT_DIR, f"{user_id}_progress.csv"))
     (pd.DataFrame([{"info":"No attempts yet."}]) if df.empty else df).to_csv(path, index=False)
     return path
 def _domain_status_md():
-    if CURRENT_INFO.get("source") == "openai":
-        return f"✅ **Domain regenerated via OpenAI** (`{CURRENT_INFO.get('model','?')}`) → **{CURRENT_SCHEMA.get('domain','?')}**. " \
-               f"Tables: {', '.join(t['name'] for t in CURRENT_SCHEMA.get('tables', []))}."
     err = CURRENT_INFO.get("error","")
     err_short = (err[:160] + "…") if len(err) > 160 else err
     return f"⚠️ **OpenAI randomization unavailable** → using fallback **{CURRENT_SCHEMA.get('domain','?')}**.\n\n> Reason: {err_short}"
@@ -751,7 +914,7 @@ def _domain_status_md():
 def regenerate_domain():
     global CURRENT_SCHEMA, CURRENT_QS, CURRENT_INFO
     prev = CURRENT_SCHEMA.get("domain") if CURRENT_SCHEMA else None
-    CURRENT_SCHEMA, CURRENT_QS, CURRENT_INFO = install_new_domain(prev_domain=prev)
     return gr.update(value=_domain_status_md(), visible=True)
 def preview_table(tbl: str):

      "requires_aliases":False,"required_aliases":[]},
 ]
+# -------------------- OpenAI JSON request helpers --------------------
 DOMAIN_AND_QUESTIONS_SCHEMA = {
     "required": ["domain", "tables", "questions"]
 }
         return json.loads(s)
     except Exception:
         pass
     start = s.find("{")
     end = s.rfind("}")
     if start != -1 and end != -1 and end > start:
             return None
     return None
+# -------------------- Canonicalization & validation --------------------
+_SQL_FENCE = re.compile(r"```sql(.*?)```", re.IGNORECASE | re.DOTALL)
+_CODE_FENCE = re.compile(r"```(.*?)```", re.DOTALL)
+def _strip_code_fences(txt: str) -> str:
+    if txt is None:
+        return ""
+    m = _SQL_FENCE.findall(txt)
+    if m:
+        return "\n".join([x.strip() for x in m if x.strip()])
+    m2 = _CODE_FENCE.findall(txt)
+    if m2:
+        return "\n".join([x.strip() for x in m2 if x.strip()])
+    return txt.strip()
def _as_list_of_sql(val) -> List[str]:
    """Coerce an LLM-supplied answer field into a list of SQL strings.

    Args:
        val: ``None``, a single string, or a list whose string items are
            individual answers. Code fences are stripped from every string.

    Returns:
        A list of non-empty SQL strings; ``[]`` for ``None``, empty input,
        or any other type.

    A string is split into one answer per line only when every non-empty
    line ends with ``;`` (i.e. each line is a complete statement). Otherwise
    the whole string is kept as a single answer, so a multi-line statement
    such as ``SELECT a\\nFROM t`` is not broken into invalid fragments.
    """
    if isinstance(val, str):
        text = _strip_code_fences(val)
        if not text:
            return []
        lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
        if len(lines) > 1 and all(ln.endswith(";") for ln in lines):
            return lines
        return [text]
    if isinstance(val, list):
        cleaned = (_strip_code_fences(v) for v in val if isinstance(v, str))
        return [s for s in cleaned if s]
    return []
def _canon_question(q: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Normalize one LLM-produced question into the canonical question dict.

    Common key synonyms are accepted (``category``/``type``/``topic``,
    ``prompt_md``/``prompt``/``question``/``text``,
    ``answer_sql``/``answers``/``solutions``/``sql``), and loosely named
    categories are mapped onto the known category labels when a keyword
    matches; unmatched categories are kept as free text.

    Args:
        q: The raw question object from the model output.

    Returns:
        A dict with keys ``id``, ``category``, ``difficulty``, ``prompt_md``,
        ``answer_sql``, ``requires_aliases`` and ``required_aliases``, or
        ``None`` when ``q`` is not a dict or has no category, prompt or
        usable answer.
    """
    import uuid  # local import: only needed to mint ids for questions without one

    if not isinstance(q, dict):
        return None

    # Field mapping / synonyms.
    cat = q.get("category") or q.get("type") or q.get("topic")
    prompt = q.get("prompt_md") or q.get("prompt") or q.get("question") or q.get("text")
    answer_sql = q.get("answer_sql") or q.get("answers") or q.get("solutions") or q.get("sql")

    cat = str(cat).strip() if cat is not None else ""
    prompt = str(prompt).strip() if prompt is not None else ""
    answers = _as_list_of_sql(answer_sql)
    if not cat or not prompt or not answers:
        return None

    # Keep known categories verbatim; otherwise try to map rough names.
    known = {
        "SELECT *", "SELECT columns", "WHERE", "Aliases",
        "JOIN (INNER)", "JOIN (LEFT)", "Aggregation", "VIEW", "CTAS / SELECT INTO",
    }
    if cat not in known:
        low = cat.lower()
        # Ordered rules: the first match wins, so "left" must precede the
        # generic "join" rule.
        rules = (
            (("select *",), "SELECT *"),
            (("select col", "columns"), "SELECT columns"),
            (("where", "filter"), "WHERE"),
            (("alias",), "Aliases"),
            (("left",), "JOIN (LEFT)"),
            (("inner", "join"), "JOIN (INNER)"),
            (("agg", "group"), "Aggregation"),
            (("view",), "VIEW"),
            (("into", "ctas", "create table"), "CTAS / SELECT INTO"),
        )
        for needles, canonical in rules:
            if any(needle in low for needle in needles):
                cat = canonical
                break
        # No rule matched: leave as-is; still usable as its own practice bucket.

    # Difficulty: accept ints, floats and numeric strings; anything that does
    # not parse (or is below 1) falls back to 1 instead of raising.
    try:
        difficulty = int(float(str(q.get("difficulty")).strip()))
    except (TypeError, ValueError, OverflowError):
        difficulty = 1
    if difficulty < 1:
        difficulty = 1

    # Models sometimes emit booleans as strings; bool("false") would be True.
    raw_flag = q.get("requires_aliases", False)
    if isinstance(raw_flag, str):
        requires_aliases = raw_flag.strip().lower() in {"true", "1", "yes", "y"}
    else:
        requires_aliases = bool(raw_flag)

    # Normalize the aliases list (a string may be comma/whitespace separated).
    req_aliases = q.get("required_aliases") or []
    if isinstance(req_aliases, str):
        req_aliases = [a.strip() for a in re.split(r"[,\s]+", req_aliases) if a.strip()]
    elif not isinstance(req_aliases, list):
        req_aliases = []

    return {
        # A random UUID avoids the collisions a millisecond timestamp plus a
        # 3-digit random suffix can produce within one batch of questions.
        "id": str(q.get("id") or f"LLM_{uuid.uuid4().hex}"),
        "category": cat,
        "difficulty": difficulty,
        "prompt_md": prompt,
        "answer_sql": answers,
        "requires_aliases": requires_aliases,
        "required_aliases": req_aliases,
    }
+def _canon_tables(tables: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    out = []
+    for t in (tables or []):
+        if not isinstance(t, dict):
+            continue
+        name = str(t.get("name","")).strip()
+        if not name:
+            continue
+        cols = t.get("columns") or []
+        good_cols = []
+        for c in cols:
+            if not isinstance(c, dict):
+                continue
+            cname = str(c.get("name","")).strip()
+            ctype = str(c.get("type","TEXT")).strip() or "TEXT"
+            if cname:
+                good_cols.append({"name": cname, "type": ctype})
+        if not good_cols:
+            continue
+        pk = t.get("pk") or []
+        if isinstance(pk, str): pk = [pk]
+        fks = t.get("fks") or []
+        rows = t.get("rows") or []
+        out.append({
+            "name": name,
+            "pk": [str(x) for x in pk],
+            "columns": good_cols,
+            "fks": fks if isinstance(fks, list) else [],
+            "rows": rows if isinstance(rows, list) else [],
+        })
+    return out
+# -------------------- LLM call --------------------
+def llm_generate_domain_and_questions(prev_domain: Optional[str]) -> Tuple[Optional[Dict[str,Any]], Optional[str], Optional[str], Dict[str,int]]:
     """
+    Returns (obj, error_message, model_used, stats_dict).
+    stats_dict contains {"accepted_questions": n, "dropped_questions": m}
     """
     if not OPENAI_AVAILABLE or not os.getenv("OPENAI_API_KEY"):
+        return None, "OpenAI client not available or OPENAI_API_KEY missing.", None, {"accepted_questions":0,"dropped_questions":0}
     errors = []
     prompt = _domain_prompt(prev_domain)
     for model in _candidate_models():
         try:
+            # Try JSON mode first
             try:
                 chat = _client.chat.completions.create(
                     model=model,
                 )
                 data_text = chat.choices[0].message.content
             except TypeError:
+                # Older SDKs: no response_format ⇒ plain chat + strict instructions
                 chat = _client.chat.completions.create(
                     model=model,
                     messages=[{"role":"system","content":"Return ONLY a JSON object. No markdown."},
                 )
                 data_text = chat.choices[0].message.content
+            obj_raw = _loose_json_parse(data_text or "")
+            if not obj_raw:
                 raise RuntimeError("Could not parse JSON from model output.")
+            # Minimal top-level validation
             for k in DOMAIN_AND_QUESTIONS_SCHEMA["required"]:
+                if k not in obj_raw:
                     raise RuntimeError(f"Missing key '{k}'")
+            # Canonicalize tables
+            tables = _canon_tables(obj_raw.get("tables", []))
+            if not tables:
+                raise RuntimeError("No usable tables in LLM output.")
+            obj_raw["tables"] = tables
+            # Canonicalize questions
+            dropped = 0
             clean_qs = []
+            for q in obj_raw.get("questions", []):
+                cq = _canon_question(q)
+                if not cq:
+                    dropped += 1
+                    continue
+                # Strip RIGHT/FULL joins from answers
+                answers = [a for a in cq["answer_sql"] if " right join " not in a.lower() and " full " not in a.lower()]
                 if not answers:
+                    dropped += 1
                     continue
+                cq["answer_sql"] = answers
+                clean_qs.append(cq)
+            if not clean_qs:
+                raise RuntimeError("No usable questions after canonicalization.")
+            stats = {"accepted_questions": len(clean_qs), "dropped_questions": dropped}
+            obj_raw["questions"] = clean_qs
+            return obj_raw, None, model, stats
         except Exception as e:
             errors.append(f"{model}: {e}")
             continue
+    return None, "; ".join(errors) if errors else "Unknown LLM error.", None, {"accepted_questions":0,"dropped_questions":0}
 # -------------------- Schema install & question handling --------------------
 def drop_existing_domain_tables(con: sqlite3.Connection, keep_internal=True):
 # -------------------- Domain bootstrap --------------------
 def bootstrap_domain_with_llm_or_fallback(prev_domain: Optional[str]):
+    obj, err, model_used, stats = llm_generate_domain_and_questions(prev_domain)
     if obj is None:
+        return FALLBACK_SCHEMA, FALLBACK_QUESTIONS, {"source":"fallback","model":None,"error":err,"accepted":0,"dropped":0}
+    return obj, obj["questions"], {"source":"openai","model":model_used,"error":None,"accepted":stats["accepted_questions"],"dropped":stats["dropped_questions"]}
def install_schema_and_prepare_questions(prev_domain: Optional[str]):
    """Pick a domain (LLM or fallback), install its schema, return its questions.

    Args:
        prev_domain: Name of the previously used domain, forwarded to the
            bootstrap step; ``None`` on first start.

    Returns:
        ``(schema, questions, info)`` for the domain that was installed.
    """
    schema, questions, info = bootstrap_domain_with_llm_or_fallback(prev_domain)
    if not questions:
        # Safety net: FALLBACK_QUESTIONS is the bank that ships with
        # FALLBACK_SCHEMA, so both are swapped together. Serving the fallback
        # bank against a different, LLM-generated schema would leave the
        # questions out of step with the installed tables.
        schema = FALLBACK_SCHEMA
        questions = FALLBACK_QUESTIONS
        info = {
            "source": "fallback",
            "model": info.get("model"),
            "error": "LLM returned 0 usable questions; used fallback schema and question bank.",
            "accepted": 0,
            "dropped": info.get("dropped", 0),
        }
    # Install only once the final schema is known.
    install_schema(CONN, schema)
    return schema, questions, info
 # -------------------- Session state --------------------
+CURRENT_SCHEMA, CURRENT_QS, CURRENT_INFO = install_schema_and_prepare_questions(prev_domain=None)
 # -------------------- Progress + mastery --------------------
 def upsert_user(con: sqlite3.Connection, user_id: str, name: str):
         return pd.read_sql_query("SELECT * FROM attempts WHERE user_id=? ORDER BY id DESC", con, params=(user_id,))
 def pick_next_question(user_id: str) -> Dict[str,Any]:
+    # Defensive: ensure we always have a pool
+    pool = CURRENT_QS if CURRENT_QS else FALLBACK_QUESTIONS
     df = fetch_attempts(CONN, user_id)
     stats = topic_stats(df)
     stats = stats.sort_values(by=["accuracy","attempts"], ascending=[True, True])
     weakest = stats.iloc[0]["category"] if not stats.empty else CATEGORIES_ORDER[0]
+    cands = [q for q in pool if str(q.get("category","")).strip() == weakest] or pool
     return dict(random.choice(cands))
 # -------------------- Execution & feedback --------------------
     if err:
         fb = f"❌ **Did not run**\n\n{err}"
         if details: fb += "\n\n" + "\n".join(details)
+        log_attempt(user_id, q.get("id","?"), q.get("category","?"), False, sql_text, elapsed, int(q.get("difficulty",1)), "bank", " | ".join([err] + details))
         stats = topic_stats(fetch_attempts(CONN, user_id))
         return gr.update(value=fb, visible=True), pd.DataFrame(), gr.update(visible=True), stats
         feedback += "\n\n" + "\n".join(details)
     feedback += "\n\n" + explanation + "\n\n**One acceptable solution:**\n```sql\n" + q["answer_sql"][0].rstrip(";") + ";\n```"
+    log_attempt(user_id, q["id"], q.get("category","?"), bool(is_correct), sql_text, elapsed, int(q.get("difficulty",1)), "bank", " | ".join(details))
     stats = topic_stats(fetch_attempts(CONN, user_id))
     return gr.update(value=feedback, visible=True), (df if df is not None else pd.DataFrame()), gr.update(visible=True), stats
 def show_hint(session: dict):
     if not session or "q" not in session:
         return gr.update(value="Start a session first.", visible=True)
+    cat = session["q"].get("category","?")
     hint = {
         "SELECT *": "Use `SELECT * FROM table_name`.",
         "SELECT columns": "List columns: `SELECT col1, col2 FROM table_name`.",
     if not slug:
         return None
     user_id = slug[:64]
+    with DB_LOCK:
+        df = pd.read_sql_query("SELECT * FROM attempts WHERE user_id=? ORDER BY id DESC", CONN, params=(user_id,))
     os.makedirs(EXPORT_DIR, exist_ok=True)
     path = os.path.abspath(os.path.join(EXPORT_DIR, f"{user_id}_progress.csv"))
     (pd.DataFrame([{"info":"No attempts yet."}]) if df.empty else df).to_csv(path, index=False)
     return path
 def _domain_status_md():
+    if CURRENT_INFO.get("source","") in ("openai","openai+fallback-questions"):
+        note = ""
+        if CURRENT_INFO.get("source") == "openai+fallback-questions":
+            note = " (LLM domain ok; used fallback questions)"
+        accepted = CURRENT_INFO.get("accepted",0)
+        dropped = CURRENT_INFO.get("dropped",0)
+        return (
+            f"✅ **Domain via OpenAI** `{CURRENT_INFO.get('model','?')}` → **{CURRENT_SCHEMA.get('domain','?')}**{note}. "
+            f"Accepted questions: {accepted}, dropped: {dropped}.  \n"
+            f"Tables: {', '.join(t['name'] for t in CURRENT_SCHEMA.get('tables', []))}."
+        )
     err = CURRENT_INFO.get("error","")
     err_short = (err[:160] + "…") if len(err) > 160 else err
     return f"⚠️ **OpenAI randomization unavailable** → using fallback **{CURRENT_SCHEMA.get('domain','?')}**.\n\n> Reason: {err_short}"
 def regenerate_domain():
     global CURRENT_SCHEMA, CURRENT_QS, CURRENT_INFO
     prev = CURRENT_SCHEMA.get("domain") if CURRENT_SCHEMA else None
+    CURRENT_SCHEMA, CURRENT_QS, CURRENT_INFO = install_schema_and_prepare_questions(prev_domain=prev)
     return gr.update(value=_domain_status_md(), visible=True)
 def preview_table(tbl: str):