Monish BV committed on
Commit
eb4dde5
·
1 Parent(s): 53def98

updated data storage

Browse files
Files changed (1) hide show
  1. backend/main.py +35 -10
backend/main.py CHANGED
@@ -62,6 +62,7 @@ DATA_DIR = BASE_DIR / "storage"
62
  DATA_DIR.mkdir(parents=True, exist_ok=True)
63
 
64
  HISTORY_FILE = DATA_DIR / "chat_history.jsonl"
 
65
 
66
  DEFAULT_SESSION = "default"
67
 
@@ -153,6 +154,27 @@ def _get_env_value(name: Optional[str]) -> str:
153
  return os.getenv(f"HF_{name}", "").strip()
154
 
155
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  def _maybe_download_existing_history() -> None:
157
  """Download existing chat history from HF dataset on startup."""
158
  repo_id = os.getenv("KIOSK_HF_DATASET_REPO", "").strip()
@@ -185,21 +207,17 @@ def _maybe_download_existing_history() -> None:
185
 
186
 
187
  def _maybe_start_hf_sync() -> None:
188
- """Start optional HF dataset syncing for chat history."""
189
  global _hf_scheduler
190
-
191
  if _hf_scheduler is not None:
192
  return
193
-
194
  repo_id = os.getenv("KIOSK_HF_DATASET_REPO", "").strip()
195
  if not repo_id or CommitScheduler is None:
196
  return
197
-
198
  _load_env_once()
199
  token = _get_env_value("KIOSK_HF_TOKEN") or os.getenv("HF_TOKEN", "").strip()
200
  path_in_repo = os.getenv("KIOSK_HF_DATASET_PATH", "chat_history").strip()
201
  interval_minutes = float(os.getenv("KIOSK_HF_SYNC_INTERVAL_MINUTES", "10"))
202
-
203
  try:
204
  _hf_scheduler = CommitScheduler(
205
  repo_id=repo_id,
@@ -207,14 +225,12 @@ def _maybe_start_hf_sync() -> None:
207
  folder_path=str(DATA_DIR),
208
  path_in_repo=path_in_repo,
209
  token=token or None,
210
- allow_patterns=[HISTORY_FILE.name],
211
  every=interval_minutes,
212
  )
213
  logger.info(
214
- "Started HF CommitScheduler for chat history: repo=%s path=%s interval=%sm",
215
- repo_id,
216
- path_in_repo or ".",
217
- interval_minutes,
218
  )
219
  except Exception as exc:
220
  warnings.warn(f"Unable to start HF sync: {exc}")
@@ -224,6 +240,7 @@ def _run_startup_tasks_in_background() -> None:
224
  """Run HF download and sync in a background thread so the server starts immediately."""
225
  def _run() -> None:
226
  try:
 
227
  _maybe_download_existing_history()
228
  _maybe_start_hf_sync()
229
  except Exception as exc:
@@ -515,6 +532,14 @@ def record_history(
515
  "action": action,
516
  }
517
  _append_json_line(HISTORY_FILE, payload)
 
 
 
 
 
 
 
 
518
  return timestamp
519
 
520
 
 
62
  DATA_DIR.mkdir(parents=True, exist_ok=True)
63
 
64
  HISTORY_FILE = DATA_DIR / "chat_history.jsonl"
65
+ USAGE_FILE = DATA_DIR / "usage_metrics.jsonl"
66
 
67
  DEFAULT_SESSION = "default"
68
 
 
154
  return os.getenv(f"HF_{name}", "").strip()
155
 
156
 
157
+ def _maybe_download_existing_metrics() -> None:
158
+ """Download existing usage metrics from HF dataset on startup."""
159
+ repo_id = os.getenv("KIOSK_HF_DATASET_REPO", "").strip()
160
+ if not repo_id or hf_hub_download is None:
161
+ return
162
+ _load_env_once()
163
+ token = _get_env_value("KIOSK_HF_TOKEN") or os.getenv("HF_TOKEN", "").strip()
164
+ path_in_repo = os.getenv("KIOSK_HF_DATASET_PATH", "chat_history").strip()
165
+ filename = f"{path_in_repo}/{USAGE_FILE.name}" if path_in_repo else USAGE_FILE.name
166
+ try:
167
+ import shutil
168
+ downloaded = hf_hub_download(
169
+ repo_id=repo_id, repo_type="dataset", filename=filename, token=token or None,
170
+ )
171
+ USAGE_FILE.parent.mkdir(parents=True, exist_ok=True)
172
+ shutil.copy(downloaded, USAGE_FILE)
173
+ logger.info("Downloaded usage metrics from HF: repo=%s file=%s", repo_id, filename)
174
+ except Exception as exc:
175
+ logger.info("No existing metrics to download (starting fresh): %s", exc)
176
+
177
+
178
  def _maybe_download_existing_history() -> None:
179
  """Download existing chat history from HF dataset on startup."""
180
  repo_id = os.getenv("KIOSK_HF_DATASET_REPO", "").strip()
 
207
 
208
 
209
  def _maybe_start_hf_sync() -> None:
210
+ """Start optional HF dataset syncing for chat history and usage metrics."""
211
  global _hf_scheduler
 
212
  if _hf_scheduler is not None:
213
  return
 
214
  repo_id = os.getenv("KIOSK_HF_DATASET_REPO", "").strip()
215
  if not repo_id or CommitScheduler is None:
216
  return
 
217
  _load_env_once()
218
  token = _get_env_value("KIOSK_HF_TOKEN") or os.getenv("HF_TOKEN", "").strip()
219
  path_in_repo = os.getenv("KIOSK_HF_DATASET_PATH", "chat_history").strip()
220
  interval_minutes = float(os.getenv("KIOSK_HF_SYNC_INTERVAL_MINUTES", "10"))
 
221
  try:
222
  _hf_scheduler = CommitScheduler(
223
  repo_id=repo_id,
 
225
  folder_path=str(DATA_DIR),
226
  path_in_repo=path_in_repo,
227
  token=token or None,
228
+ allow_patterns=[HISTORY_FILE.name, USAGE_FILE.name],
229
  every=interval_minutes,
230
  )
231
  logger.info(
232
+ "Started HF CommitScheduler for chat_history and usage_metrics: repo=%s path=%s interval=%s",
233
+ repo_id, path_in_repo or ".", interval_minutes,
 
 
234
  )
235
  except Exception as exc:
236
  warnings.warn(f"Unable to start HF sync: {exc}")
 
240
  """Run HF download and sync in a background thread so the server starts immediately."""
241
  def _run() -> None:
242
  try:
243
+ _maybe_download_existing_metrics()
244
  _maybe_download_existing_history()
245
  _maybe_start_hf_sync()
246
  except Exception as exc:
 
532
  "action": action,
533
  }
534
  _append_json_line(HISTORY_FILE, payload)
535
+ usage_entry = {
536
+ "timestamp": timestamp,
537
+ "session_id": session_id,
538
+ "blueprint": blueprint,
539
+ "question": question,
540
+ }
541
+ usage_entry.setdefault("action_type", action.get("type"))
542
+ _append_json_line(USAGE_FILE, usage_entry)
543
  return timestamp
544
 
545