hparten committed on
Commit
47a20a8
·
1 Parent(s): e95a1cb

updated logging

Browse files
Files changed (1) hide show
  1. app.py +151 -55
app.py CHANGED
@@ -1,12 +1,24 @@
1
  import os
2
  import csv
3
  import uuid
 
 
4
  from datetime import datetime
 
 
 
5
  import torch
6
  import gradio as gr
 
 
7
  from filelock import FileLock
8
  from huggingface_hub import HfApi
9
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 
 
 
 
 
10
  from peft import PeftModel
11
 
12
  # =========================
@@ -15,14 +27,15 @@ from peft import PeftModel
15
  MAX_HISTORY_TURNS = 10
16
  MAX_PROMPT_TOKENS = 1024
17
  MAX_NEW_TOKENS = 60
 
18
 
19
- LOG_DIR = "/tmp/chat_logs"
20
  os.makedirs(LOG_DIR, exist_ok=True)
21
  LOCK_PATH = os.path.join(LOG_DIR, ".lock")
22
 
23
  HF_TOKEN = os.environ.get("HF_TOKEN")
24
- PRIVATE_LOG_REPO = "hparten/math_chat_logs" # Private dataset repo
25
- HF_API = HfApi()
26
 
27
  MODEL_ID = "hparten/prob1_qlora_math_student"
28
 
@@ -32,7 +45,6 @@ MODEL_ID = "hparten/prob1_qlora_math_student"
32
  model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto")
33
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
34
  tokenizer.pad_token = tokenizer.eos_token
35
-
36
  pipe = pipeline(
37
  "text-generation",
38
  model=model,
@@ -51,79 +63,161 @@ strategy_explanations = {
51
  }
52
 
53
  # =========================
54
- # 🧠 System Prompt
55
  # =========================
56
  def build_system_block(problem_prefix, strategy):
57
  problem_text = "41 plus blank equals 84"
58
  strat_key = strategy.lower()
59
- strat_expl = strategy_explanations.get(strat_key, "Use the named strategy to explain your steps clearly.")
 
 
60
  strategy_tag = f"<strategy_{strat_key}>"
61
  problem_tag = f"<{problem_prefix.lower()}>"
62
-
63
  system_text = (
64
  f"<system>\n"
65
  f"You are the student in a math dialogue.\n"
66
- f"PROBLEM: {problem_tag} - {problem_text}\n"
67
- f"STRATEGY: {strategy_tag} — {strat_expl}\n"
68
- f"When you answer, think step by step, like a student explaining their work out loud.\n"
69
- f"Keep your answers short and natural—1 sentence. Let the teacher ask follow-up questions.\n"
70
- f"Reply exactly to the teacher questions using <student> ... </student>. Never include any teacher text in your answer.\n"
71
  f"</system>\n"
72
  )
73
  return system_text.strip()
74
 
75
  # =========================
76
- # 🧾 Logging (Private Upload)
77
  # =========================
78
  CSV_HEADERS = ["timestamp", "session_id", "username", "strategy", "teacher", "student"]
79
 
80
- def log_turn(session_id, username, strategy, teacher_msg, student_msg):
81
- path = os.path.join(LOG_DIR, f"chat_{session_id}.csv")
82
- file_exists = os.path.exists(path)
83
-
84
  with FileLock(LOCK_PATH):
 
85
  with open(path, "a", newline="", encoding="utf-8") as f:
86
- writer = csv.writer(f)
87
  if not file_exists:
88
- writer.writerow(CSV_HEADERS)
89
- writer.writerow([
90
- datetime.now().isoformat(timespec="seconds"),
91
- session_id,
92
- username,
93
- strategy,
94
- teacher_msg,
95
- student_msg,
96
- ])
97
-
98
- # --- Try uploading to private dataset repo ---
99
- try:
100
- HF_API.upload_file(
101
- path_or_fileobj=path,
102
- path_in_repo=f"{os.path.basename(path)}",
103
- repo_id=PRIVATE_LOG_REPO,
104
- repo_type="dataset",
105
- token=HF_TOKEN,
106
- )
107
- print(f"✅ Uploaded log to private dataset: {PRIVATE_LOG_REPO}")
108
- except Exception as e:
109
- print(f"⚠️ Could not push log: {e}")
110
 
111
  # =========================
112
- # 🧩 Prompt Builder
113
  # =========================
114
  def build_prompt(strategy, history, teacher_question, tokenizer, problem_prefix="Problem_1"):
115
  base_system_prompt = build_system_block(problem_prefix, strategy)
116
- turns = [f"<teacher> {tq} </teacher> <student> {sa} </student>" for tq, sa in history[-MAX_HISTORY_TURNS:]]
 
 
 
117
  full_prompt = base_system_prompt + "\n" + " ".join(turns)
118
- full_prompt += f"<teacher> {teacher_question} </teacher>\n"
119
-
120
  while len(tokenizer.encode(full_prompt, add_special_tokens=False)) > MAX_PROMPT_TOKENS and len(turns) > 0:
121
  turns.pop(0)
122
  convo_block = " ".join(turns)
123
  full_prompt = base_system_prompt + convo_block + f"<teacher> {teacher_question} </teacher>"
124
-
125
  return full_prompt.strip()
126
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  # =========================
128
  # 🤖 Generation
129
  # =========================
@@ -133,19 +227,21 @@ def generate_response(teacher_question, username, history, session_id, strategy)
133
  prompt,
134
  max_new_tokens=MAX_NEW_TOKENS,
135
  do_sample=True,
136
- temperature=0.5,
137
  top_p=0.9,
138
  repetition_penalty=1.05,
 
139
  pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
 
 
140
  return_full_text=False,
141
  )
142
  out_text = out[0]["generated_text"]
143
-
144
  if "<student>" in out_text and "</student>" in out_text:
145
  student_reply = out_text.split("<student>", 1)[1].split("</student>", 1)[0].strip()
146
  else:
147
  student_reply = out_text.strip()
148
-
149
  history.append((teacher_question, student_reply))
150
  log_turn(session_id, username, strategy, teacher_question, student_reply)
151
  return student_reply, history
@@ -164,7 +260,6 @@ def on_send(teacher_question, username, strategy_choice, history, session_id):
164
  if not teacher_question.strip():
165
  gr.Warning("Please type a question for the student before sending.")
166
  return history, history, "", session_id
167
-
168
  student_reply, history = generate_response(
169
  teacher_question.strip(),
170
  username.strip(),
@@ -172,15 +267,16 @@ def on_send(teacher_question, username, strategy_choice, history, session_id):
172
  session_id,
173
  strategy_choice.lower(),
174
  )
175
-
176
  msgs = []
177
  for t, s in history[-MAX_HISTORY_TURNS:]:
178
  msgs.append({"role": "user", "content": t})
179
  msgs.append({"role": "assistant", "content": s})
180
-
181
  return msgs, history, "", session_id
182
 
183
  def on_reset():
 
 
 
184
  return [], [], "", uuid.uuid4().hex[:12]
185
 
186
  # =========================
@@ -194,7 +290,7 @@ with gr.Blocks(title="Elementary Math Student Chatbot") as demo:
194
  )
195
 
196
  with gr.Row():
197
- username = gr.Textbox(label="👤 Your Name", placeholder="Enter your name...")
198
  strategy_choice = gr.Dropdown(
199
  ["friendly", "differencing", "subtraction"],
200
  value="friendly",
@@ -206,20 +302,20 @@ with gr.Blocks(title="Elementary Math Student Chatbot") as demo:
206
  chat = gr.Chatbot(label="💬 Chat", type="messages")
207
  state_history = gr.State([])
208
  state_session = gr.State("")
209
- send = gr.Button("Send", variant="primary")
210
 
 
211
  send.click(
212
  on_send,
213
  inputs=[teacher_q, username, strategy_choice, state_history, state_session],
214
  outputs=[chat, state_history, teacher_q, state_session],
215
  )
216
-
217
  reset_btn.click(
218
  on_reset,
219
  inputs=[],
220
  outputs=[chat, state_history, teacher_q, state_session],
221
  )
222
 
 
223
  if __name__ == "__main__":
224
  demo.queue()
225
  demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)
 
1
  import os
2
  import csv
3
  import uuid
4
+ import time
5
+ import threading
6
  from datetime import datetime
7
+ from typing import List, Tuple
8
+ import tempfile
9
+ import pandas as pd
10
  import torch
11
  import gradio as gr
12
+ from fastapi import Request
13
+ from fastapi.responses import JSONResponse
14
  from filelock import FileLock
15
  from huggingface_hub import HfApi
16
+ from datasets import load_dataset, Dataset
17
+ from transformers import (
18
+ AutoTokenizer,
19
+ AutoModelForCausalLM,
20
+ pipeline,
21
+ )
22
  from peft import PeftModel
23
 
24
  # =========================
 
27
  MAX_HISTORY_TURNS = 10
28
  MAX_PROMPT_TOKENS = 1024
29
  MAX_NEW_TOKENS = 60
30
+ INACTIVITY_LIMIT = 600 # 10 minutes
31
 
32
+ LOG_DIR = "logs"
33
  os.makedirs(LOG_DIR, exist_ok=True)
34
  LOCK_PATH = os.path.join(LOG_DIR, ".lock")
35
 
36
  HF_TOKEN = os.environ.get("HF_TOKEN")
37
+ HF_DATASET_REPO = "hparten/math_chatbot_logs" # 🔒 must be private
38
+ SPACE_ID = os.environ.get("SPACE_ID")
39
 
40
  MODEL_ID = "hparten/prob1_qlora_math_student"
41
 
 
45
  model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto")
46
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
47
  tokenizer.pad_token = tokenizer.eos_token
 
48
  pipe = pipeline(
49
  "text-generation",
50
  model=model,
 
63
  }
64
 
65
  # =========================
66
+ # 🧠 Build System Prompt
67
  # =========================
68
def build_system_block(problem_prefix, strategy):
    """Build the <system> prompt block that sets up the student persona.

    Args:
        problem_prefix: Problem tag name (e.g. "Problem_1"); lowercased into the tag.
        strategy: Strategy name; lowercased and used to look up its explanation.

    Returns:
        The stripped "<system>...</system>" prompt text.
    """
    problem_text = "41 plus blank equals 84"
    strat_key = strategy.lower()
    strat_expl = strategy_explanations.get(
        strat_key, "Use the named strategy to explain your steps clearly."
    )
    strategy_tag = f"<strategy_{strat_key}>"
    problem_tag = f"<{problem_prefix.lower()}>"
    system_text = (
        f"<system>\n"
        f"You are the student in a math dialogue.\n"
        f"Solving the PROBLEM: {problem_tag} - {problem_text}\n"
        f"Using the STRATEGY: {strategy_tag} — {strat_expl}\n"
        # Fixed: this line was missing its trailing "\n", so it ran straight
        # into the next instruction in the rendered prompt.
        f"Return EXACTLY one sentence inside <student> ... </student>.\n"
        f"Do NOT ask questions or include teacher text.\n"
        # Fixed typo in the prompt text: "implicity" -> "implicitly".
        f"Mention the strategy implicitly only if natural.\n"
        f"</system>\n"
    )
    return system_text.strip()
87
 
88
  # =========================
89
+ # 🧾 Local CSV (backup)
90
  # =========================
91
CSV_HEADERS = ["timestamp", "session_id", "username", "strategy", "teacher", "student"]

def _append_csv(path, row):
    """Append one row to a CSV file, writing the header row first on a fresh file.

    Serialized through the global FileLock so concurrent sessions cannot
    interleave partial writes.
    """
    with FileLock(LOCK_PATH):
        is_new_file = not os.path.exists(path)
        with open(path, "a", newline="", encoding="utf-8") as handle:
            writer = csv.writer(handle)
            if is_new_file:
                writer.writerow(CSV_HEADERS)
            writer.writerow(row)
101
+
102
def log_turn(session_id, username, strategy, teacher_msg, student_msg):
    """Record one teacher/student exchange.

    Writes a per-session CSV backup on local disk, buffers the row in memory
    for the later HF-dataset flush, and refreshes the session's activity stamp.
    """
    stamp = datetime.now().isoformat(timespec="seconds")
    csv_path = os.path.join(LOG_DIR, f"chat_{session_id}.csv")
    _append_csv(csv_path, [stamp, session_id, username, strategy, teacher_msg, student_msg])
    add_turn_to_memory(session_id, username, strategy, teacher_msg, student_msg)
    update_activity(session_id)
 
 
 
 
 
 
115
 
116
  # =========================
117
+ # 🧩 Prompt builder
118
  # =========================
119
def build_prompt(strategy, history, teacher_question, tokenizer, problem_prefix="Problem_1"):
    """Assemble the generation prompt: system block + recent turns + new question.

    Oldest turns are dropped until the prompt fits within MAX_PROMPT_TOKENS.
    The prompt always ends with an open "<student>" tag so the model continues
    in the student's voice.

    Args:
        strategy: Strategy name forwarded to build_system_block.
        history: List of (teacher, student) tuples; only the last
            MAX_HISTORY_TURNS are considered.
        teacher_question: The new teacher utterance to answer.
        tokenizer: Tokenizer used to measure prompt length.
        problem_prefix: Problem tag name, default "Problem_1".

    Returns:
        The stripped prompt string.
    """
    base_system_prompt = build_system_block(problem_prefix, strategy)
    turns = [
        f"<teacher> {tq} </teacher> <student> {sa} </student>"
        for tq, sa in history[-MAX_HISTORY_TURNS:]
    ]

    def _assemble():
        # One canonical layout for both the initial and the truncated prompt.
        return (
            base_system_prompt
            + "\n"
            + " ".join(turns)
            + f"<teacher> {teacher_question} </teacher>\n<student>"
        )

    full_prompt = _assemble()
    # Fixed: the truncation path previously rebuilt the prompt WITHOUT the
    # trailing "\n<student>" opener and without the newline after the system
    # block, so truncated prompts cued the model differently.
    while len(tokenizer.encode(full_prompt, add_special_tokens=False)) > MAX_PROMPT_TOKENS and turns:
        turns.pop(0)
        full_prompt = _assemble()
    return full_prompt.strip()
132
 
133
+ # =========================
134
+ # ❌ Banned Tokens
135
+ # =========================
136
def make_bad_words_ids(tokenizer, words: List[str]) -> List[List[int]]:
    """Translate banned words into the token-id sequences generation must avoid.

    A word that is a registered special token maps to its single id (skipped if
    it resolves to the unknown token); any other word is encoded normally and
    skipped if it encodes to nothing.
    """
    banned: List[List[int]] = []
    for word in words:
        if word in tokenizer.all_special_tokens:
            token_id = tokenizer.convert_tokens_to_ids(word)
            if token_id != tokenizer.unk_token_id:
                banned.append([token_id])
            continue
        ids = tokenizer.encode(word, add_special_tokens=False)
        if ids:
            banned.append(ids)
    return banned
148
+
149
# Token sequences the model must never emit: teacher/system tags and plain
# "Teacher:" prefixes, so the student never speaks as the teacher.
bad_words_ids = make_bad_words_ids(
    tokenizer, ["<teacher>", "</teacher>", "<system>", "</system>", "Teacher:", "teacher:"]
)
# Treat the closing student tag as EOS so generation stops after one reply.
# NOTE(review): if "</student>" is not a single token in the vocabulary this
# returns unk_token_id — confirm it was registered as a special token.
eos_id = tokenizer.convert_tokens_to_ids("</student>")
153
+
154
+ # =========================
155
+ # ☁️ In-memory + Parquet HF Logging
156
+ # =========================
157
+ api = HfApi()
158
# Per-session in-memory buffers feeding the HF-dataset flush path.
session_logs = {}    # session_id -> list of row dicts
last_activity = {}   # session_id -> epoch seconds of the last turn
log_lock = threading.Lock()

def add_turn_to_memory(session_id, username, strategy, teacher_msg, student_msg):
    """Buffer one exchange in memory until the session is flushed to the Hub."""
    entry = dict(
        timestamp=datetime.now().isoformat(timespec="seconds"),
        session_id=session_id,
        username=username,
        strategy=strategy,
        teacher=teacher_msg,
        student=student_msg,
    )
    with log_lock:
        session_logs.setdefault(session_id, []).append(entry)

def update_activity(session_id):
    """Stamp the session as active right now (read by the inactivity sweeper)."""
    last_activity[session_id] = time.time()
176
+
177
def flush_session_to_hub(session_id):
    """Append this session to one Parquet file in the private HF dataset."""
    # Grab-and-clear this session's buffered rows under the lock.
    with log_lock:
        if session_id not in session_logs or not session_logs[session_id]:
            return
        df = pd.DataFrame(session_logs[session_id])
        # NOTE(review): rows are removed from memory BEFORE the upload below;
        # if upload_file raises, this session's rows are lost from the Hub
        # (only the local per-session CSV backup remains).
        del session_logs[session_id]
    try:
        # Pull the existing log and append; a missing/empty dataset (or any
        # transient load error — the broad except does not distinguish) means
        # we start from just this session's rows.
        ds = load_dataset(HF_DATASET_REPO, split="train", token=HF_TOKEN)
        existing = ds.to_pandas()
        combined = pd.concat([existing, df], ignore_index=True)
    except Exception:
        combined = df
    # Write the combined frame to a temp parquet file, then push it.
    # NOTE(review): this read-modify-write of chat_logs.parquet is not atomic;
    # two concurrent flushes can drop each other's rows — confirm acceptable.
    with tempfile.NamedTemporaryFile("wb", delete=False, suffix=".parquet") as tmp:
        combined.to_parquet(tmp.name, index=False)
        tmp_path = tmp.name
    api.upload_file(
        path_or_fileobj=tmp_path,
        path_in_repo="chat_logs.parquet",
        repo_id=HF_DATASET_REPO,
        repo_type="dataset",
        token=HF_TOKEN,
    )
    os.remove(tmp_path)
    print(f"[flush] Uploaded session {session_id} to HF dataset.")
202
+
203
+ # =========================
204
+ # ⏰ Inactivity + Tab Close Flush
205
+ # =========================
206
def check_inactivity_loop():
    """Background sweeper: flush sessions inactive for more than INACTIVITY_LIMIT.

    Runs forever in a daemon thread, waking once a minute.
    """
    while True:
        now = time.time()
        # Fixed: snapshot the items first. update_activity() mutates
        # last_activity from request threads, and iterating the live dict
        # while it changes size raises RuntimeError.
        snapshot = list(last_activity.items())
        inactive = [sid for sid, ts in snapshot if now - ts > INACTIVITY_LIMIT]
        for sid in inactive:
            try:
                flush_session_to_hub(sid)
                # pop() with a default: the entry may already be gone if the
                # session was flushed/reset concurrently.
                last_activity.pop(sid, None)
            except Exception as e:
                print(f"[auto-flush-error] {sid}: {e}")
        time.sleep(60)

threading.Thread(target=check_inactivity_loop, daemon=True).start()
220
+
221
  # =========================
222
  # 🤖 Generation
223
  # =========================
 
227
  prompt,
228
  max_new_tokens=MAX_NEW_TOKENS,
229
  do_sample=True,
230
+ temperature=0.4,
231
  top_p=0.9,
232
  repetition_penalty=1.05,
233
+ no_repeat_ngram_size=6,
234
  pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
235
+ eos_token_id=eos_id,
236
+ bad_words_ids=bad_words_ids,
237
  return_full_text=False,
238
  )
239
  out_text = out[0]["generated_text"]
 
240
  if "<student>" in out_text and "</student>" in out_text:
241
  student_reply = out_text.split("<student>", 1)[1].split("</student>", 1)[0].strip()
242
  else:
243
  student_reply = out_text.strip()
244
+ student_reply = student_reply.split(".")[0].strip() + "."
245
  history.append((teacher_question, student_reply))
246
  log_turn(session_id, username, strategy, teacher_question, student_reply)
247
  return student_reply, history
 
260
  if not teacher_question.strip():
261
  gr.Warning("Please type a question for the student before sending.")
262
  return history, history, "", session_id
 
263
  student_reply, history = generate_response(
264
  teacher_question.strip(),
265
  username.strip(),
 
267
  session_id,
268
  strategy_choice.lower(),
269
  )
 
270
  msgs = []
271
  for t, s in history[-MAX_HISTORY_TURNS:]:
272
  msgs.append({"role": "user", "content": t})
273
  msgs.append({"role": "assistant", "content": s})
 
274
  return msgs, history, "", session_id
275
 
276
def on_reset(session_id=None):
    """Reset the chat UI, flushing the finished session's logs to the Hub first.

    Args:
        session_id: The per-session id to flush. Defaults to None (no flush)
            for backward compatibility with the current inputs=[] wiring.
            The previous code read ``state_session.value``, which is the
            gr.State component's *initial* value (always ""), so the flush
            never actually fired.

    Returns:
        (chat_messages, history, teacher_textbox_value, new_session_id)
    """
    # TODO(review): wire the click handler as
    #   reset_btn.click(on_reset, inputs=[state_session], outputs=[...])
    # so the real session id reaches this function.
    if session_id:
        try:
            flush_session_to_hub(session_id)
        except Exception as e:
            # Best-effort: a failed flush must not block the UI reset.
            print(f"[reset-flush-error] {session_id}: {e}")
    return [], [], "", uuid.uuid4().hex[:12]
281
 
282
  # =========================
 
290
  )
291
 
292
  with gr.Row():
293
+ username = gr.Textbox(label="👤 Your Name (first last)", placeholder="Enter your name...")
294
  strategy_choice = gr.Dropdown(
295
  ["friendly", "differencing", "subtraction"],
296
  value="friendly",
 
302
  chat = gr.Chatbot(label="💬 Chat", type="messages")
303
  state_history = gr.State([])
304
  state_session = gr.State("")
 
305
 
306
+ send = gr.Button("Send", variant="primary")
307
  send.click(
308
  on_send,
309
  inputs=[teacher_q, username, strategy_choice, state_history, state_session],
310
  outputs=[chat, state_history, teacher_q, state_session],
311
  )
 
312
  reset_btn.click(
313
  on_reset,
314
  inputs=[],
315
  outputs=[chat, state_history, teacher_q, state_session],
316
  )
317
 
318
+
319
  if __name__ == "__main__":
320
  demo.queue()
321
  demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)