#!/usr/bin/env bash # scripts/learning/sync-claude-sessions.sh # # Read Claude Code session transcripts from ~/.claude/projects//*.jsonl # for ALL CUI Labs monorepos, scrub secrets, batch events, POST to # /api/learning/ingest. Tracks last-pushed cursor per session so it # only sends NEW lines on each tick. # # Christopher's directive 2026-05-05: "everything is a dataset that you # use - so let bee use it as well." This script is the local-machine # half of that directive — Vercel can't read from his disk; this can. # # Usage: # ./scripts/learning/sync-claude-sessions.sh # one-shot # ./scripts/learning/sync-claude-sessions.sh --dry-run # preview only # ./scripts/learning/sync-claude-sessions.sh --since=N # last N hours # # Requires: # - jq, curl, sed # - BEE_API_URL + CRON_SECRET in .env (or environment) # # Designed to run periodically via launchd or cron. Idempotent — re-runs # only push lines past the cursor. set -euo pipefail DRY_RUN=false SINCE_HOURS=24 for arg in "$@"; do case "$arg" in --dry-run) DRY_RUN=true ;; --since=*) SINCE_HOURS="${arg#--since=}" ;; *) echo "WARN: unknown flag '$arg'" >&2 ;; esac done REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)" ENV_FILE="${REPO_ROOT}/.env" if [ -f "${ENV_FILE}" ]; then set -a # shellcheck disable=SC1090 source "${ENV_FILE}" set +a fi API_URL="${BEE_PORTAL_URL:-https://bee.cuilabs.io}" SECRET="${CRON_SECRET:-}" if [ -z "${SECRET}" ]; then echo "FATAL: CRON_SECRET not set (check ${ENV_FILE} or env)" >&2 exit 1 fi # NOTE: BEE_API_URL in .env points at the Modal/HF Python backend # (model inference), NOT the Next.js portal where /api/learning/ingest # lives. Use BEE_PORTAL_URL to override; default is production Vercel. CLAUDE_ROOT="${HOME}/.claude/projects" if [ ! -d "${CLAUDE_ROOT}" ]; then echo "FATAL: ${CLAUDE_ROOT} not found — Claude Code not installed?" >&2 exit 1 fi CURSOR_DIR="${HOME}/.claude/bee-sync" mkdir -p "${CURSOR_DIR}" log() { echo "[$(date -u +%H:%M:%S)] $*" } # Secrets we never want shipped to a public dataset. The list is # deliberately broader than strictly necessary — better to lose a real # token than to leak one. SCRUB_PATTERNS=( 's/sk-[a-zA-Z0-9_-]{20,}/sk-REDACTED/g' 's/AKIA[0-9A-Z]{16}/AKIA-REDACTED/g' 's/(api[_-]?key|API[_-]?KEY)[":= ]+[a-zA-Z0-9_-]{16,}/\1=REDACTED/g' 's/Bearer [a-zA-Z0-9._-]{16,}/Bearer REDACTED/g' 's/(ghp|gho|ghu|ghs|ghr)_[a-zA-Z0-9]{36,}/gh-REDACTED/g' 's/hf_[a-zA-Z0-9]{32,}/hf_REDACTED/g' 's/xox[bpoa]-[a-zA-Z0-9-]{10,}/slack-REDACTED/g' ) scrub() { local content="$1" for pat in "${SCRUB_PATTERNS[@]}"; do content="$(printf '%s' "${content}" | sed -E "${pat}")" done printf '%s' "${content}" } # Decode Claude's encoded-path directory naming back to a repo name: # "-Users-christopherfrost-Desktop-Bee" → "Bee" decode_repo() { local enc="$1" basename "${enc//-//}" } post_batch() { local repo="$1" local session_id="$2" local events_json="$3" local count count="$(printf '%s' "${events_json}" | jq 'length')" if [ "${count}" -eq 0 ]; then return 0 fi local payload payload="$(jq -n \ --arg src "claude_session" \ --arg sid "${session_id}" \ --arg repo "${repo}" \ --argjson events "${events_json}" \ '{source: $src, session_id: $sid, repo: $repo, events: $events}')" if [ "${DRY_RUN}" = "true" ]; then log "DRY-RUN: would POST ${count} events for ${repo} session=${session_id:0:8}" return 0 fi local resp if ! resp="$(curl -sS -X POST "${API_URL}/api/learning/ingest" \ -H "Authorization: Bearer ${SECRET}" \ -H "Content-Type: application/json" \ --data "${payload}" \ --max-time 60)"; then log "ERR: curl failed posting batch for ${repo} session=${session_id:0:8}" return 1 fi local status status="$(printf '%s' "${resp}" | jq -r '.status // "error"')" if [ "${status}" = "ok" ]; then local samples upload_path samples="$(printf '%s' "${resp}" | jq -r '.samples')" upload_path="$(printf '%s' "${resp}" | jq -r '.upload_path')" log "OK: ${repo} session=${session_id:0:8} → ${samples} samples in ${upload_path}" else # Surface the raw API response so debugging is auditable. Jq # extraction of .error // "unknown" was hiding the real cause. local err raw_resp if ! err="$(printf '%s' "${resp}" | jq -r '.error // empty' 2>/dev/null)"; then err="" fi if [ -z "${err}" ]; then raw_resp="$(printf '%s' "${resp}" | head -c 300)" err="raw_response: ${raw_resp}" fi log "ERR: ${repo} session=${session_id:0:8} → ${err}" fi } # Iterate every monorepo project Claude has tracked, every session. # Per-session cursor file stores the last byte offset we pushed. total_repos=0 total_sessions=0 total_events=0 # Global "first post in this script run" flag — so the 30 s throttle # applies between EVERY POST, not just chunk-to-chunk within a single # session. 2026-05-05 incident: a per-session flag let the first chunk # of every session fire immediately, producing 9+ commits in <10 s # (yesterday's 23:35:17–23:35:27 burst), which blew through HF's # 128/hr commit ceiling and turned the rest of the run into 502s # (the 54 % error rate Vercel reported on /api/learning/ingest). first_post=true for proj_dir in "${CLAUDE_ROOT}"/*/; do proj="$(basename "${proj_dir%/}")" repo="$(decode_repo "${proj}")" total_repos=$((total_repos + 1)) for transcript in "${proj_dir}"*.jsonl; do [ -e "${transcript}" ] || continue session_id="$(basename "${transcript}" .jsonl)" cursor_file="${CURSOR_DIR}/${proj}__${session_id}.cursor" last_offset=0 if [ -f "${cursor_file}" ]; then last_offset="$(cat "${cursor_file}")" fi file_size="$(stat -f %z "${transcript}" 2>/dev/null || stat -c %s "${transcript}")" if [ "${last_offset}" -ge "${file_size}" ]; then continue fi # Cutoff: skip files older than --since= hours since last modify. file_mtime="$(stat -f %m "${transcript}" 2>/dev/null || stat -c %Y "${transcript}")" now_epoch="$(date +%s)" age_hours=$(( (now_epoch - file_mtime) / 3600 )) if [ "${age_hours}" -gt "${SINCE_HOURS}" ]; then continue fi total_sessions=$((total_sessions + 1)) # Read new lines since cursor. Each line is a Claude session event. # We parse LINE BY LINE (not jq -s) because Claude transcripts can # legitimately contain literal control characters inside text fields # — `jq -s` aborts on the first bad line and loses the whole batch. # `jq -c -R 'fromjson? | ...'` skips bad lines silently and emits # one compact line per good event, which we then bundle into a JSON # array via the final `jq -s`. Scrub secrets downstream. tmp_jsonl="/tmp/bee-sync-events-$$-$(echo "${session_id}" | cut -c1-8).jsonl" tail -c "+$((last_offset + 1))" "${transcript}" | jq -c -R ' fromjson? | select(.type == "user" or .type == "assistant") | { type: .type, timestamp: .timestamp, text: ( if .message.content | type == "string" then .message.content elif .message.content | type == "array" then (.message.content | map( if .type == "text" then .text elif .type == "tool_use" then "[tool_use:" + .name + "] " + (.input | tostring | .[0:2000]) elif .type == "tool_result" then "[tool_result] " + (.content | tostring | .[0:2000]) else "" end ) | join("\n")) else "" end ) } ' > "${tmp_jsonl}" 2>/dev/null if [ ! -s "${tmp_jsonl}" ]; then events_json="[]" else events_json="$(jq -s '.' "${tmp_jsonl}")" fi rm -f "${tmp_jsonl}" # Scrub secrets in the joined text fields. We do it as a final pass # over the JSON string for simplicity — sed runs over each pattern. for pat in "${SCRUB_PATTERNS[@]}"; do events_json="$(printf '%s' "${events_json}" | sed -E "${pat}")" done event_count="$(printf '%s' "${events_json}" | jq 'length')" total_events=$((total_events + event_count)) if [ "${event_count}" -gt 0 ]; then # Chunk size 500 (was 50) — HF rate-limits commits to 128/hr per # repo, and prior chunk=50 produced ~10x more commits than needed. # 2026-05-05 sync hit 429 after 70 commits with chunk=50. # 500 events ≈ ~1MB payload, well inside Vercel's 4.5MB body limit. # Plus 30s sleep between POSTs so 30 sessions × ~5 batches each # spread across ~75 min — comfortably under the 128/hr ceiling. offset=0 while [ "${offset}" -lt "${event_count}" ]; do if [ "${first_post}" = "false" ] && [ "${DRY_RUN}" != "true" ]; then sleep 30 fi chunk="$(printf '%s' "${events_json}" | jq ".[${offset}:${offset}+500]")" post_batch "${repo}" "${session_id}" "${chunk}" offset=$((offset + 500)) first_post=false done fi # Update cursor to current EOF — even if nothing landed, we mark # this section processed so the next run starts past it. if [ "${DRY_RUN}" != "true" ]; then printf '%s' "${file_size}" > "${cursor_file}" fi done done log "scan complete: repos=${total_repos} sessions=${total_sessions} events=${total_events}"