Spaces:
Sleeping
Sleeping
Monish BV committed on
Commit ·
eb4dde5
1
Parent(s): 53def98
updated data storage
Browse files
- backend/main.py +35 -10
backend/main.py
CHANGED
|
@@ -62,6 +62,7 @@ DATA_DIR = BASE_DIR / "storage"
|
|
| 62 |
DATA_DIR.mkdir(parents=True, exist_ok=True)
|
| 63 |
|
| 64 |
HISTORY_FILE = DATA_DIR / "chat_history.jsonl"
|
|
|
|
| 65 |
|
| 66 |
DEFAULT_SESSION = "default"
|
| 67 |
|
|
@@ -153,6 +154,27 @@ def _get_env_value(name: Optional[str]) -> str:
|
|
| 153 |
return os.getenv(f"HF_{name}", "").strip()
|
| 154 |
|
| 155 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
def _maybe_download_existing_history() -> None:
|
| 157 |
"""Download existing chat history from HF dataset on startup."""
|
| 158 |
repo_id = os.getenv("KIOSK_HF_DATASET_REPO", "").strip()
|
|
@@ -185,21 +207,17 @@ def _maybe_download_existing_history() -> None:
|
|
| 185 |
|
| 186 |
|
| 187 |
def _maybe_start_hf_sync() -> None:
|
| 188 |
-
"""Start optional HF dataset syncing for chat history."""
|
| 189 |
global _hf_scheduler
|
| 190 |
-
|
| 191 |
if _hf_scheduler is not None:
|
| 192 |
return
|
| 193 |
-
|
| 194 |
repo_id = os.getenv("KIOSK_HF_DATASET_REPO", "").strip()
|
| 195 |
if not repo_id or CommitScheduler is None:
|
| 196 |
return
|
| 197 |
-
|
| 198 |
_load_env_once()
|
| 199 |
token = _get_env_value("KIOSK_HF_TOKEN") or os.getenv("HF_TOKEN", "").strip()
|
| 200 |
path_in_repo = os.getenv("KIOSK_HF_DATASET_PATH", "chat_history").strip()
|
| 201 |
interval_minutes = float(os.getenv("KIOSK_HF_SYNC_INTERVAL_MINUTES", "10"))
|
| 202 |
-
|
| 203 |
try:
|
| 204 |
_hf_scheduler = CommitScheduler(
|
| 205 |
repo_id=repo_id,
|
|
@@ -207,14 +225,12 @@ def _maybe_start_hf_sync() -> None:
|
|
| 207 |
folder_path=str(DATA_DIR),
|
| 208 |
path_in_repo=path_in_repo,
|
| 209 |
token=token or None,
|
| 210 |
-
allow_patterns=[HISTORY_FILE.name],
|
| 211 |
every=interval_minutes,
|
| 212 |
)
|
| 213 |
logger.info(
|
| 214 |
-
"Started HF CommitScheduler for
|
| 215 |
-
repo_id,
|
| 216 |
-
path_in_repo or ".",
|
| 217 |
-
interval_minutes,
|
| 218 |
)
|
| 219 |
except Exception as exc:
|
| 220 |
warnings.warn(f"Unable to start HF sync: {exc}")
|
|
@@ -224,6 +240,7 @@ def _run_startup_tasks_in_background() -> None:
|
|
| 224 |
"""Run HF download and sync in a background thread so the server starts immediately."""
|
| 225 |
def _run() -> None:
|
| 226 |
try:
|
|
|
|
| 227 |
_maybe_download_existing_history()
|
| 228 |
_maybe_start_hf_sync()
|
| 229 |
except Exception as exc:
|
|
@@ -515,6 +532,14 @@ def record_history(
|
|
| 515 |
"action": action,
|
| 516 |
}
|
| 517 |
_append_json_line(HISTORY_FILE, payload)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 518 |
return timestamp
|
| 519 |
|
| 520 |
|
|
|
|
| 62 |
DATA_DIR.mkdir(parents=True, exist_ok=True)
|
| 63 |
|
| 64 |
HISTORY_FILE = DATA_DIR / "chat_history.jsonl"
|
| 65 |
+
USAGE_FILE = DATA_DIR / "usage_metrics.jsonl"
|
| 66 |
|
| 67 |
DEFAULT_SESSION = "default"
|
| 68 |
|
|
|
|
| 154 |
return os.getenv(f"HF_{name}", "").strip()
|
| 155 |
|
| 156 |
|
| 157 |
+
def _maybe_download_existing_metrics() -> None:
    """Pull a previously synced usage-metrics file from the HF dataset repo.

    Best-effort startup step. Returns immediately when
    ``KIOSK_HF_DATASET_REPO`` is unset or blank, or when ``hf_hub_download``
    is ``None``. Otherwise the remote file is downloaded and copied over
    ``USAGE_FILE``. Any exception raised along the way is logged at INFO
    level and swallowed.
    """
    dataset_repo = os.getenv("KIOSK_HF_DATASET_REPO", "").strip()
    if not dataset_repo or hf_hub_download is None:
        return

    _load_env_once()
    hf_token = _get_env_value("KIOSK_HF_TOKEN") or os.getenv("HF_TOKEN", "").strip()
    repo_subdir = os.getenv("KIOSK_HF_DATASET_PATH", "chat_history").strip()

    # With an empty path setting, the bare file name is requested.
    if repo_subdir:
        remote_name = f"{repo_subdir}/{USAGE_FILE.name}"
    else:
        remote_name = USAGE_FILE.name

    try:
        import shutil

        cached_copy = hf_hub_download(
            repo_id=dataset_repo,
            repo_type="dataset",
            filename=remote_name,
            token=hf_token or None,
        )
        USAGE_FILE.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(cached_copy, USAGE_FILE)
        logger.info(
            "Downloaded usage metrics from HF: repo=%s file=%s",
            dataset_repo,
            remote_name,
        )
    except Exception as exc:
        # Deliberately broad: every failure is reported as "nothing to restore".
        logger.info("No existing metrics to download (starting fresh): %s", exc)
|
| 176 |
+
|
| 177 |
+
|
| 178 |
def _maybe_download_existing_history() -> None:
|
| 179 |
"""Download existing chat history from HF dataset on startup."""
|
| 180 |
repo_id = os.getenv("KIOSK_HF_DATASET_REPO", "").strip()
|
|
|
|
| 207 |
|
| 208 |
|
| 209 |
def _maybe_start_hf_sync() -> None:
|
| 210 |
+
"""Start optional HF dataset syncing for chat history and usage metrics."""
|
| 211 |
global _hf_scheduler
|
|
|
|
| 212 |
if _hf_scheduler is not None:
|
| 213 |
return
|
|
|
|
| 214 |
repo_id = os.getenv("KIOSK_HF_DATASET_REPO", "").strip()
|
| 215 |
if not repo_id or CommitScheduler is None:
|
| 216 |
return
|
|
|
|
| 217 |
_load_env_once()
|
| 218 |
token = _get_env_value("KIOSK_HF_TOKEN") or os.getenv("HF_TOKEN", "").strip()
|
| 219 |
path_in_repo = os.getenv("KIOSK_HF_DATASET_PATH", "chat_history").strip()
|
| 220 |
interval_minutes = float(os.getenv("KIOSK_HF_SYNC_INTERVAL_MINUTES", "10"))
|
|
|
|
| 221 |
try:
|
| 222 |
_hf_scheduler = CommitScheduler(
|
| 223 |
repo_id=repo_id,
|
|
|
|
| 225 |
folder_path=str(DATA_DIR),
|
| 226 |
path_in_repo=path_in_repo,
|
| 227 |
token=token or None,
|
| 228 |
+
allow_patterns=[HISTORY_FILE.name, USAGE_FILE.name],
|
| 229 |
every=interval_minutes,
|
| 230 |
)
|
| 231 |
logger.info(
|
| 232 |
+
"Started HF CommitScheduler for chat_history and usage_metrics: repo=%s path=%s interval=%s",
|
| 233 |
+
repo_id, path_in_repo or ".", interval_minutes,
|
|
|
|
|
|
|
| 234 |
)
|
| 235 |
except Exception as exc:
|
| 236 |
warnings.warn(f"Unable to start HF sync: {exc}")
|
|
|
|
| 240 |
"""Run HF download and sync in a background thread so the server starts immediately."""
|
| 241 |
def _run() -> None:
|
| 242 |
try:
|
| 243 |
+
_maybe_download_existing_metrics()
|
| 244 |
_maybe_download_existing_history()
|
| 245 |
_maybe_start_hf_sync()
|
| 246 |
except Exception as exc:
|
|
|
|
| 532 |
"action": action,
|
| 533 |
}
|
| 534 |
_append_json_line(HISTORY_FILE, payload)
|
| 535 |
+
usage_entry = {
|
| 536 |
+
"timestamp": timestamp,
|
| 537 |
+
"session_id": session_id,
|
| 538 |
+
"blueprint": blueprint,
|
| 539 |
+
"question": question,
|
| 540 |
+
}
|
| 541 |
+
usage_entry.setdefault("action_type", action.get("type"))
|
| 542 |
+
_append_json_line(USAGE_FILE, usage_entry)
|
| 543 |
return timestamp
|
| 544 |
|
| 545 |
|