hparten committed on
Commit
47a20a8
·
1 Parent(s): e95a1cb

updated logging

Browse files
Files changed (1) hide show
  1. app.py +151 -55
app.py CHANGED
@@ -1,12 +1,24 @@
1
  import os
2
  import csv
3
  import uuid
 
 
4
  from datetime import datetime
 
 
 
5
  import torch
6
  import gradio as gr
 
 
7
  from filelock import FileLock
8
  from huggingface_hub import HfApi
9
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 
 
 
 
 
10
  from peft import PeftModel
11
 
12
  # =========================
@@ -15,14 +27,15 @@ from peft import PeftModel
15
  MAX_HISTORY_TURNS = 10
16
  MAX_PROMPT_TOKENS = 1024
17
  MAX_NEW_TOKENS = 60
 
18
 
19
- LOG_DIR = "/tmp/chat_logs"
20
  os.makedirs(LOG_DIR, exist_ok=True)
21
  LOCK_PATH = os.path.join(LOG_DIR, ".lock")
22
 
23
  HF_TOKEN = os.environ.get("HF_TOKEN")
24
- PRIVATE_LOG_REPO = "hparten/math_chat_logs" # Private dataset repo
25
- HF_API = HfApi()
26
 
27
  MODEL_ID = "hparten/prob1_qlora_math_student"
28
 
@@ -32,7 +45,6 @@ MODEL_ID = "hparten/prob1_qlora_math_student"
32
  model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto")
33
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
34
  tokenizer.pad_token = tokenizer.eos_token
35
-
36
  pipe = pipeline(
37
  "text-generation",
38
  model=model,
@@ -51,79 +63,161 @@ strategy_explanations = {
51
  }
52
 
53
  # =========================
54
- # 🧠 System Prompt
55
  # =========================
56
  def build_system_block(problem_prefix, strategy):
57
  problem_text = "41 plus blank equals 84"
58
  strat_key = strategy.lower()
59
- strat_expl = strategy_explanations.get(strat_key, "Use the named strategy to explain your steps clearly.")
 
 
60
  strategy_tag = f"<strategy_{strat_key}>"
61
  problem_tag = f"<{problem_prefix.lower()}>"
62
-
63
  system_text = (
64
  f"<system>\n"
65
  f"You are the student in a math dialogue.\n"
66
- f"PROBLEM: {problem_tag} - {problem_text}\n"
67
- f"STRATEGY: {strategy_tag} — {strat_expl}\n"
68
- f"When you answer, think step by step, like a student explaining their work out loud.\n"
69
- f"Keep your answers short and natural—1 sentence. Let the teacher ask follow-up questions.\n"
70
- f"Reply exactly to the teacher questions using <student> ... </student>. Never include any teacher text in your answer.\n"
71
  f"</system>\n"
72
  )
73
  return system_text.strip()
74
 
75
  # =========================
76
- # 🧾 Logging (Private Upload)
77
  # =========================
78
  CSV_HEADERS = ["timestamp", "session_id", "username", "strategy", "teacher", "student"]
79
 
80
- def log_turn(session_id, username, strategy, teacher_msg, student_msg):
81
- path = os.path.join(LOG_DIR, f"chat_{session_id}.csv")
82
- file_exists = os.path.exists(path)
83
-
84
  with FileLock(LOCK_PATH):
 
85
  with open(path, "a", newline="", encoding="utf-8") as f:
86
- writer = csv.writer(f)
87
  if not file_exists:
88
- writer.writerow(CSV_HEADERS)
89
- writer.writerow([
90
- datetime.now().isoformat(timespec="seconds"),
91
- session_id,
92
- username,
93
- strategy,
94
- teacher_msg,
95
- student_msg,
96
- ])
97
-
98
- # --- Try uploading to private dataset repo ---
99
- try:
100
- HF_API.upload_file(
101
- path_or_fileobj=path,
102
- path_in_repo=f"{os.path.basename(path)}",
103
- repo_id=PRIVATE_LOG_REPO,
104
- repo_type="dataset",
105
- token=HF_TOKEN,
106
- )
107
- print(f"✅ Uploaded log to private dataset: {PRIVATE_LOG_REPO}")
108
- except Exception as e:
109
- print(f"⚠️ Could not push log: {e}")
110
 
111
  # =========================
112
- # 🧩 Prompt Builder
113
  # =========================
114
  def build_prompt(strategy, history, teacher_question, tokenizer, problem_prefix="Problem_1"):
115
  base_system_prompt = build_system_block(problem_prefix, strategy)
116
- turns = [f"<teacher> {tq} </teacher> <student> {sa} </student>" for tq, sa in history[-MAX_HISTORY_TURNS:]]
 
 
 
117
  full_prompt = base_system_prompt + "\n" + " ".join(turns)
118
- full_prompt += f"<teacher> {teacher_question} </teacher>\n"
119
-
120
  while len(tokenizer.encode(full_prompt, add_special_tokens=False)) > MAX_PROMPT_TOKENS and len(turns) > 0:
121
  turns.pop(0)
122
  convo_block = " ".join(turns)
123
  full_prompt = base_system_prompt + convo_block + f"<teacher> {teacher_question} </teacher>"
124
-
125
  return full_prompt.strip()
126
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  # =========================
128
  # 🤖 Generation
129
  # =========================
@@ -133,19 +227,21 @@ def generate_response(teacher_question, username, history, session_id, strategy)
133
  prompt,
134
  max_new_tokens=MAX_NEW_TOKENS,
135
  do_sample=True,
136
- temperature=0.5,
137
  top_p=0.9,
138
  repetition_penalty=1.05,
 
139
  pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
 
 
140
  return_full_text=False,
141
  )
142
  out_text = out[0]["generated_text"]
143
-
144
  if "<student>" in out_text and "</student>" in out_text:
145
  student_reply = out_text.split("<student>", 1)[1].split("</student>", 1)[0].strip()
146
  else:
147
  student_reply = out_text.strip()
148
-
149
  history.append((teacher_question, student_reply))
150
  log_turn(session_id, username, strategy, teacher_question, student_reply)
151
  return student_reply, history
@@ -164,7 +260,6 @@ def on_send(teacher_question, username, strategy_choice, history, session_id):
164
  if not teacher_question.strip():
165
  gr.Warning("Please type a question for the student before sending.")
166
  return history, history, "", session_id
167
-
168
  student_reply, history = generate_response(
169
  teacher_question.strip(),
170
  username.strip(),
@@ -172,15 +267,16 @@ def on_send(teacher_question, username, strategy_choice, history, session_id):
172
  session_id,
173
  strategy_choice.lower(),
174
  )
175
-
176
  msgs = []
177
  for t, s in history[-MAX_HISTORY_TURNS:]:
178
  msgs.append({"role": "user", "content": t})
179
  msgs.append({"role": "assistant", "content": s})
180
-
181
  return msgs, history, "", session_id
182
 
183
  def on_reset():
 
 
 
184
  return [], [], "", uuid.uuid4().hex[:12]
185
 
186
  # =========================
@@ -194,7 +290,7 @@ with gr.Blocks(title="Elementary Math Student Chatbot") as demo:
194
  )
195
 
196
  with gr.Row():
197
- username = gr.Textbox(label="👤 Your Name", placeholder="Enter your name...")
198
  strategy_choice = gr.Dropdown(
199
  ["friendly", "differencing", "subtraction"],
200
  value="friendly",
@@ -206,20 +302,20 @@ with gr.Blocks(title="Elementary Math Student Chatbot") as demo:
206
  chat = gr.Chatbot(label="💬 Chat", type="messages")
207
  state_history = gr.State([])
208
  state_session = gr.State("")
209
- send = gr.Button("Send", variant="primary")
210
 
 
211
  send.click(
212
  on_send,
213
  inputs=[teacher_q, username, strategy_choice, state_history, state_session],
214
  outputs=[chat, state_history, teacher_q, state_session],
215
  )
216
-
217
  reset_btn.click(
218
  on_reset,
219
  inputs=[],
220
  outputs=[chat, state_history, teacher_q, state_session],
221
  )
222
 
 
223
  if __name__ == "__main__":
224
  demo.queue()
225
  demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)
 
1
  import os
2
  import csv
3
  import uuid
4
+ import time
5
+ import threading
6
  from datetime import datetime
7
+ from typing import List, Tuple
8
+ import tempfile
9
+ import pandas as pd
10
  import torch
11
  import gradio as gr
12
+ from fastapi import Request
13
+ from fastapi.responses import JSONResponse
14
  from filelock import FileLock
15
  from huggingface_hub import HfApi
16
+ from datasets import load_dataset, Dataset
17
+ from transformers import (
18
+ AutoTokenizer,
19
+ AutoModelForCausalLM,
20
+ pipeline,
21
+ )
22
  from peft import PeftModel
23
 
24
  # =========================
 
27
  MAX_HISTORY_TURNS = 10
28
  MAX_PROMPT_TOKENS = 1024
29
  MAX_NEW_TOKENS = 60
30
+ INACTIVITY_LIMIT = 600 # 10 minutes
31
 
32
+ LOG_DIR = "logs"
33
  os.makedirs(LOG_DIR, exist_ok=True)
34
  LOCK_PATH = os.path.join(LOG_DIR, ".lock")
35
 
36
  HF_TOKEN = os.environ.get("HF_TOKEN")
37
+ HF_DATASET_REPO = "hparten/math_chatbot_logs" # 🔒 must be private
38
+ SPACE_ID = os.environ.get("SPACE_ID")
39
 
40
  MODEL_ID = "hparten/prob1_qlora_math_student"
41
 
 
45
  model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto")
46
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
47
  tokenizer.pad_token = tokenizer.eos_token
 
48
  pipe = pipeline(
49
  "text-generation",
50
  model=model,
 
63
  }
64
 
65
  # =========================
66
+ # 🧠 Build System Prompt
67
  # =========================
68
def build_system_block(problem_prefix, strategy):
    """Build the <system> prompt block that sets up the student persona.

    Args:
        problem_prefix: Problem tag name (e.g. "Problem_1"); lowercased into the tag.
        strategy: Strategy name; lowercased and used to look up its explanation.

    Returns:
        The stripped "<system>...</system>" prompt text.
    """
    problem_text = "41 plus blank equals 84"
    strat_key = strategy.lower()
    strat_expl = strategy_explanations.get(
        strat_key, "Use the named strategy to explain your steps clearly."
    )
    strategy_tag = f"<strategy_{strat_key}>"
    problem_tag = f"<{problem_prefix.lower()}>"
    system_text = (
        f"<system>\n"
        f"You are the student in a math dialogue.\n"
        f"Solving the PROBLEM: {problem_tag} - {problem_text}\n"
        f"Using the STRATEGY: {strategy_tag} — {strat_expl}\n"
        # Fixed: this line was missing its trailing "\n", so it ran straight
        # into the next instruction in the rendered prompt.
        f"Return EXACTLY one sentence inside <student> ... </student>.\n"
        f"Do NOT ask questions or include teacher text.\n"
        # Fixed typo in the prompt text: "implicity" -> "implicitly".
        f"Mention the strategy implicitly only if natural.\n"
        f"</system>\n"
    )
    return system_text.strip()
87
 
88
  # =========================
89
+ # 🧾 Local CSV (backup)
90
  # =========================
91
CSV_HEADERS = ["timestamp", "session_id", "username", "strategy", "teacher", "student"]

def _append_csv(path, row):
    """Append one row to a CSV file, writing the header row first on a fresh file.

    Serialized through the global FileLock so concurrent sessions cannot
    interleave partial writes.
    """
    with FileLock(LOCK_PATH):
        is_new_file = not os.path.exists(path)
        with open(path, "a", newline="", encoding="utf-8") as handle:
            writer = csv.writer(handle)
            if is_new_file:
                writer.writerow(CSV_HEADERS)
            writer.writerow(row)
101
+
102
def log_turn(session_id, username, strategy, teacher_msg, student_msg):
    """Record one teacher/student exchange.

    Writes a per-session CSV backup on local disk, buffers the row in memory
    for the later HF-dataset flush, and refreshes the session's activity stamp.
    """
    stamp = datetime.now().isoformat(timespec="seconds")
    csv_path = os.path.join(LOG_DIR, f"chat_{session_id}.csv")
    _append_csv(csv_path, [stamp, session_id, username, strategy, teacher_msg, student_msg])
    add_turn_to_memory(session_id, username, strategy, teacher_msg, student_msg)
    update_activity(session_id)
 
 
 
 
 
 
115
 
116
  # =========================
117
+ # 🧩 Prompt builder
118
  # =========================
119
def build_prompt(strategy, history, teacher_question, tokenizer, problem_prefix="Problem_1"):
    """Assemble the generation prompt: system block + recent turns + new question.

    Oldest turns are dropped until the prompt fits within MAX_PROMPT_TOKENS.
    The prompt always ends with an open "<student>" tag so the model continues
    in the student's voice.

    Args:
        strategy: Strategy name forwarded to build_system_block.
        history: List of (teacher, student) tuples; only the last
            MAX_HISTORY_TURNS are considered.
        teacher_question: The new teacher utterance to answer.
        tokenizer: Tokenizer used to measure prompt length.
        problem_prefix: Problem tag name, default "Problem_1".

    Returns:
        The stripped prompt string.
    """
    base_system_prompt = build_system_block(problem_prefix, strategy)
    turns = [
        f"<teacher> {tq} </teacher> <student> {sa} </student>"
        for tq, sa in history[-MAX_HISTORY_TURNS:]
    ]

    def _assemble():
        # One canonical layout for both the initial and the truncated prompt.
        return (
            base_system_prompt
            + "\n"
            + " ".join(turns)
            + f"<teacher> {teacher_question} </teacher>\n<student>"
        )

    full_prompt = _assemble()
    # Fixed: the truncation path previously rebuilt the prompt WITHOUT the
    # trailing "\n<student>" opener and without the newline after the system
    # block, so truncated prompts cued the model differently.
    while len(tokenizer.encode(full_prompt, add_special_tokens=False)) > MAX_PROMPT_TOKENS and turns:
        turns.pop(0)
        full_prompt = _assemble()
    return full_prompt.strip()
132
 
133
+ # =========================
134
+ # ❌ Banned Tokens
135
+ # =========================
136
def make_bad_words_ids(tokenizer, words: List[str]) -> List[List[int]]:
    """Translate banned words into the token-id sequences generation must avoid.

    A word that is a registered special token maps to its single id (skipped if
    it resolves to the unknown token); any other word is encoded normally and
    skipped if it encodes to nothing.
    """
    banned: List[List[int]] = []
    for word in words:
        if word in tokenizer.all_special_tokens:
            token_id = tokenizer.convert_tokens_to_ids(word)
            if token_id != tokenizer.unk_token_id:
                banned.append([token_id])
            continue
        ids = tokenizer.encode(word, add_special_tokens=False)
        if ids:
            banned.append(ids)
    return banned
148
+
149
# Token sequences the model must never emit: teacher/system tags and plain
# "Teacher:" prefixes, so the student never speaks as the teacher.
bad_words_ids = make_bad_words_ids(
    tokenizer, ["<teacher>", "</teacher>", "<system>", "</system>", "Teacher:", "teacher:"]
)
# Treat the closing student tag as EOS so generation stops after one reply.
# NOTE(review): if "</student>" is not a single token in the vocabulary this
# returns unk_token_id — confirm it was registered as a special token.
eos_id = tokenizer.convert_tokens_to_ids("</student>")
153
+
154
+ # =========================
155
+ # ☁️ In-memory + Parquet HF Logging
156
+ # =========================
157
+ api = HfApi()
158
# Per-session in-memory buffers feeding the HF-dataset flush path.
session_logs = {}    # session_id -> list of row dicts
last_activity = {}   # session_id -> epoch seconds of the last turn
log_lock = threading.Lock()

def add_turn_to_memory(session_id, username, strategy, teacher_msg, student_msg):
    """Buffer one exchange in memory until the session is flushed to the Hub."""
    entry = dict(
        timestamp=datetime.now().isoformat(timespec="seconds"),
        session_id=session_id,
        username=username,
        strategy=strategy,
        teacher=teacher_msg,
        student=student_msg,
    )
    with log_lock:
        session_logs.setdefault(session_id, []).append(entry)

def update_activity(session_id):
    """Stamp the session as active right now (read by the inactivity sweeper)."""
    last_activity[session_id] = time.time()
176
+
177
def flush_session_to_hub(session_id):
    """Append this session to one Parquet file in the private HF dataset."""
    # Grab-and-clear this session's buffered rows under the lock.
    with log_lock:
        if session_id not in session_logs or not session_logs[session_id]:
            return
        df = pd.DataFrame(session_logs[session_id])
        # NOTE(review): rows are removed from memory BEFORE the upload below;
        # if upload_file raises, this session's rows are lost from the Hub
        # (only the local per-session CSV backup remains).
        del session_logs[session_id]
    try:
        # Pull the existing log and append; a missing/empty dataset (or any
        # transient load error — the broad except does not distinguish) means
        # we start from just this session's rows.
        ds = load_dataset(HF_DATASET_REPO, split="train", token=HF_TOKEN)
        existing = ds.to_pandas()
        combined = pd.concat([existing, df], ignore_index=True)
    except Exception:
        combined = df
    # Write the combined frame to a temp parquet file, then push it.
    # NOTE(review): this read-modify-write of chat_logs.parquet is not atomic;
    # two concurrent flushes can drop each other's rows — confirm acceptable.
    with tempfile.NamedTemporaryFile("wb", delete=False, suffix=".parquet") as tmp:
        combined.to_parquet(tmp.name, index=False)
        tmp_path = tmp.name
    api.upload_file(
        path_or_fileobj=tmp_path,
        path_in_repo="chat_logs.parquet",
        repo_id=HF_DATASET_REPO,
        repo_type="dataset",
        token=HF_TOKEN,
    )
    os.remove(tmp_path)
    print(f"[flush] Uploaded session {session_id} to HF dataset.")
202
+
203
+ # =========================
204
+ # ⏰ Inactivity + Tab Close Flush
205
+ # =========================
206
def check_inactivity_loop():
    """Background sweeper: flush sessions inactive for more than INACTIVITY_LIMIT.

    Runs forever in a daemon thread, waking once a minute.
    """
    while True:
        now = time.time()
        # Fixed: snapshot the items first. update_activity() mutates
        # last_activity from request threads, and iterating the live dict
        # while it changes size raises RuntimeError.
        snapshot = list(last_activity.items())
        inactive = [sid for sid, ts in snapshot if now - ts > INACTIVITY_LIMIT]
        for sid in inactive:
            try:
                flush_session_to_hub(sid)
                # pop() with a default: the entry may already be gone if the
                # session was flushed/reset concurrently.
                last_activity.pop(sid, None)
            except Exception as e:
                print(f"[auto-flush-error] {sid}: {e}")
        time.sleep(60)

threading.Thread(target=check_inactivity_loop, daemon=True).start()
220
+
221
  # =========================
222
  # 🤖 Generation
223
  # =========================
 
227
  prompt,
228
  max_new_tokens=MAX_NEW_TOKENS,
229
  do_sample=True,
230
+ temperature=0.4,
231
  top_p=0.9,
232
  repetition_penalty=1.05,
233
+ no_repeat_ngram_size=6,
234
  pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
235
+ eos_token_id=eos_id,
236
+ bad_words_ids=bad_words_ids,
237
  return_full_text=False,
238
  )
239
  out_text = out[0]["generated_text"]
 
240
  if "<student>" in out_text and "</student>" in out_text:
241
  student_reply = out_text.split("<student>", 1)[1].split("</student>", 1)[0].strip()
242
  else:
243
  student_reply = out_text.strip()
244
+ student_reply = student_reply.split(".")[0].strip() + "."
245
  history.append((teacher_question, student_reply))
246
  log_turn(session_id, username, strategy, teacher_question, student_reply)
247
  return student_reply, history
 
260
  if not teacher_question.strip():
261
  gr.Warning("Please type a question for the student before sending.")
262
  return history, history, "", session_id
 
263
  student_reply, history = generate_response(
264
  teacher_question.strip(),
265
  username.strip(),
 
267
  session_id,
268
  strategy_choice.lower(),
269
  )
 
270
  msgs = []
271
  for t, s in history[-MAX_HISTORY_TURNS:]:
272
  msgs.append({"role": "user", "content": t})
273
  msgs.append({"role": "assistant", "content": s})
 
274
  return msgs, history, "", session_id
275
 
276
def on_reset(session_id=None):
    """Reset the chat UI, flushing the finished session's logs to the Hub first.

    Args:
        session_id: The per-session id to flush. Defaults to None (no flush)
            for backward compatibility with the current inputs=[] wiring.
            The previous code read ``state_session.value``, which is the
            gr.State component's *initial* value (always ""), so the flush
            never actually fired.

    Returns:
        (chat_messages, history, teacher_textbox_value, new_session_id)
    """
    # TODO(review): wire the click handler as
    #   reset_btn.click(on_reset, inputs=[state_session], outputs=[...])
    # so the real session id reaches this function.
    if session_id:
        try:
            flush_session_to_hub(session_id)
        except Exception as e:
            # Best-effort: a failed flush must not block the UI reset.
            print(f"[reset-flush-error] {session_id}: {e}")
    return [], [], "", uuid.uuid4().hex[:12]
281
 
282
  # =========================
 
290
  )
291
 
292
  with gr.Row():
293
+ username = gr.Textbox(label="👤 Your Name (first last)", placeholder="Enter your name...")
294
  strategy_choice = gr.Dropdown(
295
  ["friendly", "differencing", "subtraction"],
296
  value="friendly",
 
302
  chat = gr.Chatbot(label="💬 Chat", type="messages")
303
  state_history = gr.State([])
304
  state_session = gr.State("")
 
305
 
306
+ send = gr.Button("Send", variant="primary")
307
  send.click(
308
  on_send,
309
  inputs=[teacher_q, username, strategy_choice, state_history, state_session],
310
  outputs=[chat, state_history, teacher_q, state_session],
311
  )
 
312
  reset_btn.click(
313
  on_reset,
314
  inputs=[],
315
  outputs=[chat, state_history, teacher_q, state_session],
316
  )
317
 
318
+
319
  if __name__ == "__main__":
320
  demo.queue()
321
  demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)