#!/usr/bin/env bash
# scripts/learning/sync-claude-sessions.sh
#
# Read Claude Code session transcripts from ~/.claude/projects/<repo>/*.jsonl
# for ALL CUI Labs monorepos, scrub secrets, batch events, POST to
# /api/learning/ingest. Tracks last-pushed cursor per session so it
# only sends NEW lines on each tick.
#
# Christopher's directive 2026-05-05: "everything is a dataset that you
# use - so let bee use it as well." This script is the local-machine
# half of that directive: Vercel can't read from his disk; this can.
#
# Usage:
#   ./scripts/learning/sync-claude-sessions.sh              # one-shot
#   ./scripts/learning/sync-claude-sessions.sh --dry-run    # preview only
#   ./scripts/learning/sync-claude-sessions.sh --since=N    # last N hours
#
# Requires:
#   - jq, curl, sed
#   - CRON_SECRET in .env (or environment)
#   - BEE_PORTAL_URL (optional; defaults to the production portal)
#
# Designed to run periodically via launchd or cron. Idempotent - re-runs
# only push lines past the cursor.
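#
# Example crontab entry (illustrative only - adjust the repo path and the
# cadence; launchd users would wrap the same command in a plist instead):
#   */30 * * * * $HOME/Desktop/Bee/scripts/learning/sync-claude-sessions.sh >> $HOME/.claude/bee-sync/sync.log 2>&1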
set -euo pipefail
DRY_RUN=false
SINCE_HOURS=24
for arg in "$@"; do
case "$arg" in
--dry-run) DRY_RUN=true ;;
--since=*) SINCE_HOURS="${arg#--since=}" ;;
*) echo "WARN: unknown flag '$arg'" >&2 ;;
esac
done
REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
ENV_FILE="${REPO_ROOT}/.env"
if [ -f "${ENV_FILE}" ]; then
  set -a
  # shellcheck disable=SC1090
  source "${ENV_FILE}"
  set +a
fi
API_URL="${BEE_PORTAL_URL:-https://bee.cuilabs.io}"
SECRET="${CRON_SECRET:-}"
if [ -z "${SECRET}" ]; then
echo "FATAL: CRON_SECRET not set (check ${ENV_FILE} or env)" >&2
exit 1
fi
# NOTE: BEE_API_URL in .env points at the Modal/HF Python backend
# (model inference), NOT the Next.js portal where /api/learning/ingest
# lives. Use BEE_PORTAL_URL to override; default is production Vercel.
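# Illustrative .env entries (placeholder values, not real secrets):
#   BEE_PORTAL_URL=https://bee.cuilabs.io
#   CRON_SECRET=<shared secret the ingest route checks>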
CLAUDE_ROOT="${HOME}/.claude/projects"
if [ ! -d "${CLAUDE_ROOT}" ]; then
echo "FATAL: ${CLAUDE_ROOT} not found β€” Claude Code not installed?" >&2
exit 1
fi
CURSOR_DIR="${HOME}/.claude/bee-sync"
mkdir -p "${CURSOR_DIR}"
log() {
echo "[$(date -u +%H:%M:%S)] $*"
}
# Secrets we never want shipped to a public dataset. The list is
# deliberately broader than strictly necessary; better to lose a real
# token than to leak one.
SCRUB_PATTERNS=(
  's/sk-[a-zA-Z0-9_-]{20,}/sk-REDACTED/g'
  's/AKIA[0-9A-Z]{16}/AKIA-REDACTED/g'
  's/(api[_-]?key|API[_-]?KEY)[":= ]+[a-zA-Z0-9_-]{16,}/\1=REDACTED/g'
  's/Bearer [a-zA-Z0-9._-]{16,}/Bearer REDACTED/g'
  's/(ghp|gho|ghu|ghs|ghr)_[a-zA-Z0-9]{36,}/gh-REDACTED/g'
  's/hf_[a-zA-Z0-9]{32,}/hf_REDACTED/g'
  's/xox[bpoa]-[a-zA-Z0-9-]{10,}/slack-REDACTED/g'
)
scrub() {
  local content="$1"
  for pat in "${SCRUB_PATTERNS[@]}"; do
    content="$(printf '%s' "${content}" | sed -E "${pat}")"
  done
  printf '%s' "${content}"
}
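# Illustrative only (fake token): scrub 'Bearer abc123def456ghi789jkl'
# prints 'Bearer REDACTED'; every pattern in SCRUB_PATTERNS is applied in turn.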
# Decode Claude's encoded-path directory naming back to a repo name:
# "-Users-christopherfrost-Desktop-Bee" β†’ "Bee"
decode_repo() {
  local enc="$1"
  basename "${enc//-//}"
}
post_batch() {
  local repo="$1"
  local session_id="$2"
  local events_json="$3"
  local count
  count="$(printf '%s' "${events_json}" | jq 'length')"
  if [ "${count}" -eq 0 ]; then
    return 0
  fi
  local payload
  payload="$(jq -n \
    --arg src "claude_session" \
    --arg sid "${session_id}" \
    --arg repo "${repo}" \
    --argjson events "${events_json}" \
    '{source: $src, session_id: $sid, repo: $repo, events: $events}')"
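  # The payload has roughly this shape (illustrative values):
  #   {"source":"claude_session","session_id":"<uuid>","repo":"Bee",
  #    "events":[{"type":"user","timestamp":"...","text":"..."}]}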
if [ "${DRY_RUN}" = "true" ]; then
log "DRY-RUN: would POST ${count} events for ${repo} session=${session_id:0:8}"
return 0
fi
local resp
if ! resp="$(curl -sS -X POST "${API_URL}/api/learning/ingest" \
-H "Authorization: Bearer ${SECRET}" \
-H "Content-Type: application/json" \
--data "${payload}" \
--max-time 60)"; then
log "ERR: curl failed posting batch for ${repo} session=${session_id:0:8}"
return 1
fi
local status
status="$(printf '%s' "${resp}" | jq -r '.status // "error"')"
if [ "${status}" = "ok" ]; then
local samples upload_path
samples="$(printf '%s' "${resp}" | jq -r '.samples')"
upload_path="$(printf '%s' "${resp}" | jq -r '.upload_path')"
log "OK: ${repo} session=${session_id:0:8} β†’ ${samples} samples in ${upload_path}"
else
# Surface the raw API response so debugging is auditable. Jq
# extraction of .error // "unknown" was hiding the real cause.
local err raw_resp
if ! err="$(printf '%s' "${resp}" | jq -r '.error // empty' 2>/dev/null)"; then
err=""
fi
if [ -z "${err}" ]; then
raw_resp="$(printf '%s' "${resp}" | head -c 300)"
err="raw_response: ${raw_resp}"
fi
log "ERR: ${repo} session=${session_id:0:8} β†’ ${err}"
fi
}
# Iterate every monorepo project Claude has tracked, every session.
# Per-session cursor file stores the last byte offset we pushed.
total_repos=0
total_sessions=0
total_events=0
# Global "first post in this script run" flag β€” so the 30 s throttle
# applies between EVERY POST, not just chunk-to-chunk within a single
# session. 2026-05-05 incident: a per-session flag let the first chunk
# of every session fire immediately, producing 9+ commits in <10 s
# (yesterday's 23:35:17–23:35:27 burst), which blew through HF's
# 128/hr commit ceiling and turned the rest of the run into 502s
# (the 54 % error rate Vercel reported on /api/learning/ingest).
first_post=true
for proj_dir in "${CLAUDE_ROOT}"/*/; do
proj="$(basename "${proj_dir%/}")"
repo="$(decode_repo "${proj}")"
total_repos=$((total_repos + 1))
for transcript in "${proj_dir}"*.jsonl; do
[ -e "${transcript}" ] || continue
session_id="$(basename "${transcript}" .jsonl)"
cursor_file="${CURSOR_DIR}/${proj}__${session_id}.cursor"
last_offset=0
if [ -f "${cursor_file}" ]; then
last_offset="$(cat "${cursor_file}")"
fi
file_size="$(stat -f %z "${transcript}" 2>/dev/null || stat -c %s "${transcript}")"
if [ "${last_offset}" -ge "${file_size}" ]; then
continue
fi
# Cutoff: skip files older than --since= hours since last modify.
file_mtime="$(stat -f %m "${transcript}" 2>/dev/null || stat -c %Y "${transcript}")"
now_epoch="$(date +%s)"
age_hours=$(( (now_epoch - file_mtime) / 3600 ))
if [ "${age_hours}" -gt "${SINCE_HOURS}" ]; then
continue
fi
total_sessions=$((total_sessions + 1))
# Read new lines since cursor. Each line is a Claude session event.
# We parse LINE BY LINE (not jq -s) because Claude transcripts can
# legitimately contain literal control characters inside text fields
# β€” `jq -s` aborts on the first bad line and loses the whole batch.
# `jq -c -R 'fromjson? | ...'` skips bad lines silently and emits
# one compact line per good event, which we then bundle into a JSON
# array via the final `jq -s`. Scrub secrets downstream.
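    # Illustrative (not a verbatim transcript line): an input line such as
    #   {"type":"assistant","timestamp":"2026-05-05T23:35:17Z","message":{"content":[{"type":"text","text":"done"}]}}
    # comes out of the filter below as
    #   {"type":"assistant","timestamp":"2026-05-05T23:35:17Z","text":"done"}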
tmp_jsonl="/tmp/bee-sync-events-$$-$(echo "${session_id}" | cut -c1-8).jsonl"
tail -c "+$((last_offset + 1))" "${transcript}" | jq -c -R '
fromjson?
| select(.type == "user" or .type == "assistant")
| {
type: .type,
timestamp: .timestamp,
text: (
if .message.content | type == "string" then .message.content
elif .message.content | type == "array" then
(.message.content | map(
if .type == "text" then .text
elif .type == "tool_use" then "[tool_use:" + .name + "] " + (.input | tostring | .[0:2000])
elif .type == "tool_result" then "[tool_result] " + (.content | tostring | .[0:2000])
else ""
end
) | join("\n"))
else ""
end
)
}
' > "${tmp_jsonl}" 2>/dev/null
if [ ! -s "${tmp_jsonl}" ]; then
events_json="[]"
else
events_json="$(jq -s '.' "${tmp_jsonl}")"
fi
rm -f "${tmp_jsonl}"
    # Scrub secrets in the joined text fields. We do it as a final pass
    # over the serialized JSON for simplicity - sed applies each pattern in turn.
    events_json="$(scrub "${events_json}")"
event_count="$(printf '%s' "${events_json}" | jq 'length')"
total_events=$((total_events + event_count))
if [ "${event_count}" -gt 0 ]; then
# Chunk size 500 (was 50) β€” HF rate-limits commits to 128/hr per
# repo, and prior chunk=50 produced ~10x more commits than needed.
# 2026-05-05 sync hit 429 after 70 commits with chunk=50.
# 500 events β‰ˆ ~1MB payload, well inside Vercel's 4.5MB body limit.
# Plus 30s sleep between POSTs so 30 sessions Γ— ~5 batches each
# spread across ~75 min β€” comfortably under the 128/hr ceiling.
offset=0
while [ "${offset}" -lt "${event_count}" ]; do
if [ "${first_post}" = "false" ] && [ "${DRY_RUN}" != "true" ]; then
sleep 30
fi
chunk="$(printf '%s' "${events_json}" | jq ".[${offset}:${offset}+500]")"
post_batch "${repo}" "${session_id}" "${chunk}"
offset=$((offset + 500))
first_post=false
done
fi
# Update cursor to current EOF β€” even if nothing landed, we mark
# this section processed so the next run starts past it.
if [ "${DRY_RUN}" != "true" ]; then
printf '%s' "${file_size}" > "${cursor_file}"
fi
done
done
log "scan complete: repos=${total_repos} sessions=${total_sessions} events=${total_events}"