hparten committed on
Commit
ef396ec
·
1 Parent(s): 9ce3c16

updated logging

Browse files
Files changed (1) hide show
  1. app.py +135 -19
app.py CHANGED
@@ -15,6 +15,9 @@ from transformers import (
15
  StoppingCriteriaList,
16
  )
17
  from peft import PeftModel
 
 
 
18
 
19
  # =========================
20
  # ⚙️ Config
@@ -85,21 +88,21 @@ def build_system_block(problem_prefix, strategy):
85
  # =========================
86
  # 🧾 Logging
87
  # =========================
88
- CSV_HEADERS = ["timestamp", "session_id", "username", "strategy", "teacher", "student"]
89
-
90
- def _append_csv(path, row):
91
- with FileLock(LOCK_PATH):
92
- file_exists = os.path.exists(path)
93
- with open(path, "a", newline="", encoding="utf-8") as f:
94
- w = csv.writer(f)
95
- if not file_exists:
96
- w.writerow(CSV_HEADERS)
97
- w.writerow(row)
98
-
99
- def log_turn(session_id, username, strategy, teacher_msg, student_msg):
100
- row = [datetime.now().isoformat(timespec="seconds"), session_id, username, strategy, teacher_msg, student_msg]
101
- per_session = os.path.join(LOG_DIR, f"chat_{session_id}.csv")
102
- _append_csv(per_session, row)
103
 
104
  # =========================
105
  # 🧩 Prompt builder
@@ -144,6 +147,64 @@ bad_words_ids = make_bad_words_ids(
144
  )
145
  eos_id = tokenizer.convert_tokens_to_ids("</student>")
146
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  # =========================
148
  # 🤖 Generation
149
  # =========================
@@ -174,9 +235,67 @@ def generate_response(teacher_question, username, history, session_id, strategy)
174
  student_reply = student_reply.split(".")[0].strip() + "."
175
 
176
  history.append((teacher_question, student_reply))
177
- log_turn(session_id, username, strategy, teacher_question, student_reply)
178
 
179
  return student_reply, history
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  # =========================
181
  # 🖥 Gradio UI
182
  # =========================
@@ -216,7 +335,6 @@ def on_reset(chat, history, teacher_q, session_id):
216
  except Exception as e:
217
  print(f"[manual flush error] {session_id}: {e}")
218
 
219
- # return cleared state and a new session id
220
  return [], [], "", uuid.uuid4().hex[:12]
221
 
222
  # =========================
@@ -257,8 +375,6 @@ with gr.Blocks(title="Elementary Math Student Chatbot") as demo:
257
  outputs=[chat, state_history, teacher_q, state_session],
258
  )
259
 
260
- if __name__ == "__main__":
261
- demo.launch()
262
 
263
  if __name__ == "__main__":
264
  demo.queue()
 
15
  StoppingCriteriaList,
16
  )
17
  from peft import PeftModel
18
+ import tempfile
19
+ import pandas as pd
20
+ from datasets import load_dataset
21
 
22
  # =========================
23
  # ⚙️ Config
 
88
  # =========================
89
  # 🧾 Logging
90
  # =========================
91
+ #CSV_HEADERS = ["timestamp", "session_id", "username", "strategy", "teacher", "student"]
92
+ #
93
+ #def _append_csv(path, row):
94
+ # with FileLock(LOCK_PATH):
95
+ # file_exists = os.path.exists(path)
96
+ # with open(path, "a", newline="", encoding="utf-8") as f:
97
+ # w = csv.writer(f)
98
+ # if not file_exists:
99
+ # w.writerow(CSV_HEADERS)
100
+ # w.writerow(row)
101
+ #
102
+ #def log_turn(session_id, username, strategy, teacher_msg, student_msg):
103
+ # row = [datetime.now().isoformat(timespec="seconds"), session_id, username, strategy, #teacher_msg, student_msg]
104
+ # per_session = os.path.join(LOG_DIR, f"chat_{session_id}.csv")
105
+ # _append_csv(per_session, row)
106
 
107
  # =========================
108
  # 🧩 Prompt builder
 
147
  )
148
  eos_id = tokenizer.convert_tokens_to_ids("</student>")
149
 
150
+ # =========================
151
+ # ☁️ In-Memory Logging + HF Upload
152
+ # =========================
153
+
154
# Hugging Face dataset repository that receives the uploaded chat logs.
HF_DATASET_REPO = "hparten/math_chatbot_logs"

# Hub client used by flush_session_to_hub() for uploads.
api = HfApi()

# In-memory buffers, both keyed by session id.
session_logs = dict()   # session_id -> list of turn rows (dicts)
last_activity = dict()  # session_id -> time.time() of the most recent turn
159
+
160
def add_turn_to_memory(session_id, username, strategy, teacher_msg, student_msg):
    """Buffer one teacher/student exchange for a session.

    The turn is appended to ``session_logs[session_id]`` (the list is created
    on first use) and the session's last-activity time is refreshed.

    Args:
        session_id: Key under which the turn is buffered.
        username: Stored verbatim in the row's ``"username"`` field.
        strategy: Stored verbatim in the row's ``"strategy"`` field.
        teacher_msg: Stored verbatim in the row's ``"teacher"`` field.
        student_msg: Stored verbatim in the row's ``"student"`` field.
    """
    from datetime import datetime

    # Naive local time, truncated to whole seconds.
    stamp = datetime.now().isoformat(timespec="seconds")
    turn = dict(
        timestamp=stamp,
        session_id=session_id,
        username=username,
        strategy=strategy,
        teacher=teacher_msg,
        student=student_msg,
    )

    turns = session_logs.setdefault(session_id, [])
    turns.append(turn)
    update_activity(session_id)
173
+
174
def update_activity(session_id):
    """Record the current time as the latest activity of ``session_id``."""
    from time import time as current_time

    last_activity[session_id] = current_time()
177
+
178
def flush_session_to_hub(session_id):
    """Upload a session's buffered turns to the Hugging Face dataset.

    The session's rows are appended to whatever ``load_dataset`` returns for
    ``HF_DATASET_REPO`` and the combined table is uploaded to that repo as
    ``chat_logs.parquet``. If the session has no buffered rows, a message is
    printed and nothing is uploaded.

    Args:
        session_id: Key into ``session_logs`` identifying the session.

    Raises:
        Exception: Whatever building/writing the Parquet file or
            ``api.upload_file`` raises. Before re-raising, the session's rows
            are put back into ``session_logs`` so a later flush can retry
            instead of silently losing them.
    """
    # Take ownership of the rows up front so turns logged while the upload is
    # running land in a fresh list rather than being discarded.
    rows = session_logs.pop(session_id, None)
    if not rows:
        print(f"[flush] No logs found for session {session_id}")
        return

    tmp_path = None
    try:
        df = pd.DataFrame(rows)

        try:
            ds = load_dataset(HF_DATASET_REPO, split="train", token=HF_TOKEN)
            combined = pd.concat([ds.to_pandas(), df], ignore_index=True)
        except Exception as e:
            # Best effort: keep going with this session alone, but say so
            # instead of swallowing the error.
            # NOTE(review): in this fallback the uploaded chat_logs.parquet
            # holds only this session; if the load failed for a transient
            # reason this may replace previously uploaded history - confirm
            # that is acceptable.
            print(
                f"[flush] Could not load existing logs ({e!r}); "
                f"uploading session {session_id} only."
            )
            combined = df

        # Create the temp file and close our handle before pandas writes to
        # it by name; writing to a still-open NamedTemporaryFile is not
        # portable.
        fd, tmp_path = tempfile.mkstemp(suffix=".parquet")
        os.close(fd)
        combined.to_parquet(tmp_path, index=False)

        api.upload_file(
            path_or_fileobj=tmp_path,
            path_in_repo="chat_logs.parquet",
            repo_id=HF_DATASET_REPO,
            repo_type="dataset",
            token=HF_TOKEN,
        )
    except Exception:
        # Put the rows back, ahead of any turns logged in the meantime, so
        # the data survives a failed upload.
        session_logs.setdefault(session_id, [])[:0] = rows
        raise
    finally:
        # Always remove the temp file, including when the upload failed.
        if tmp_path is not None and os.path.exists(tmp_path):
            os.remove(tmp_path)

    print(f"[flush] Uploaded session {session_id} to HF dataset.")
207
+
208
  # =========================
209
  # 🤖 Generation
210
  # =========================
 
235
  student_reply = student_reply.split(".")[0].strip() + "."
236
 
237
  history.append((teacher_question, student_reply))
238
+ add_turn_to_memory(session_id, username, strategy, teacher_question, student_reply)
239
 
240
  return student_reply, history
241
+
242
+ # =========================
243
+ # ☁️ Flush session logs to Hugging Face Hub
244
+ # =========================
245
+ #
246
+ #def flush_session_to_hub(session_id):
247
+ # """Append this session to one Parquet file in the private HF dataset."""
248
+ # if session_id not in session_logs or not session_logs[session_id]:
249
+ # print(f"[flush] No logs found for session {session_id}")
250
+ # return
251
+ #
252
+ # df = pd.DataFrame(session_logs[session_id])
253
+ # del session_logs[session_id]
254
+ #
255
+ # try:
256
+ # ds = load_dataset(HF_DATASET_REPO, split="train", token=HF_TOKEN)
257
+ # existing = ds.to_pandas()
258
+ # combined = pd.concat([existing, df], ignore_index=True)
259
+ # except Exception:
260
+ # combined = df
261
+ #
262
+ # with tempfile.NamedTemporaryFile("wb", delete=False, suffix=".parquet") as tmp:
263
+ # combined.to_parquet(tmp.name, index=False)
264
+ # tmp_path = tmp.name
265
+ #
266
+ # api.upload_file(
267
+ # path_or_fileobj=tmp_path,
268
+ # path_in_repo="chat_logs.parquet",
269
+ # repo_id=HF_DATASET_REPO,
270
+ # repo_type="dataset",
271
+ # token=HF_TOKEN,
272
+ # )
273
+ #
274
+ # os.remove(tmp_path)
275
+ # print(f"[flush] Uploaded session {session_id} to HF dataset.")
276
+
277
+ # =========================
278
+ # Inactivity flush
279
+ # =========================
280
+
281
import threading
import time

# Seconds a session may stay idle before its buffered logs are flushed.
INACTIVITY_LIMIT = 10 * 60  # 10 minutes
284
+
285
def _inactive_sessions(activity, now, limit):
    """Return ``(session_id, last_seen)`` pairs idle for more than ``limit`` seconds."""
    # Iterate over a copy: other threads may add sessions to the live dict
    # while we scan, and iterating the live dict would then raise
    # "dictionary changed size during iteration".
    return [(sid, seen) for sid, seen in activity.copy().items() if now - seen > limit]


def check_inactivity_loop():
    """Flush sessions that have been idle longer than ``INACTIVITY_LIMIT``.

    Runs forever: once a minute it scans ``last_activity``, uploads every
    inactive session via ``flush_session_to_hub`` and then forgets the
    session. A failed flush is printed and the session is kept, so it is
    retried on the next pass.
    """
    while True:
        stale = _inactive_sessions(last_activity, time.time(), INACTIVITY_LIMIT)
        for sid, seen_at in stale:
            try:
                flush_session_to_hub(sid)
                # Forget the session only if no newer turn arrived while the
                # upload was running; otherwise keep the fresh timestamp so
                # that turn is flushed on a later pass. pop() tolerates the
                # entry having been removed elsewhere in the meantime.
                if last_activity.get(sid) == seen_at:
                    last_activity.pop(sid, None)
            except Exception as e:
                print(f"[auto-flush-error] {sid}: {e}")
        time.sleep(60)
296
+
297
# Start the inactivity watcher in the background; as a daemon thread it does
# not keep the process alive on shutdown.
threading.Thread(
    target=check_inactivity_loop,
    daemon=True,
).start()
298
+
299
  # =========================
300
  # 🖥 Gradio UI
301
  # =========================
 
335
  except Exception as e:
336
  print(f"[manual flush error] {session_id}: {e}")
337
 
 
338
  return [], [], "", uuid.uuid4().hex[:12]
339
 
340
  # =========================
 
375
  outputs=[chat, state_history, teacher_q, state_session],
376
  )
377
 
 
 
378
 
379
  if __name__ == "__main__":
380
  demo.queue()