#!/usr/bin/env bash
# scripts/learning/sync-claude-sessions.sh
#
# Read Claude Code session transcripts from ~/.claude/projects/<repo>/*.jsonl
# for ALL CUI Labs monorepos, scrub secrets, batch events, POST to
# /api/learning/ingest. Tracks last-pushed cursor per session so it
# only sends NEW lines on each tick.
#
# Christopher's directive 2026-05-05: "everything is a dataset that you
# use - so let bee use it as well." This script is the local-machine
# half of that directive - Vercel can't read from his disk; this can.
#
# Usage:
#   ./scripts/learning/sync-claude-sessions.sh             # one-shot
#   ./scripts/learning/sync-claude-sessions.sh --dry-run   # preview only
#   ./scripts/learning/sync-claude-sessions.sh --since=N   # last N hours
#
# Requires:
# - jq, curl, sed
# - CRON_SECRET in .env (or environment); BEE_PORTAL_URL optional (see note below)
#
# Designed to run periodically via launchd or cron. Idempotent - re-runs
# only push lines past the cursor.
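#
# Example (illustrative) crontab entry - the path assumes the Bee checkout
# lives at ~/Desktop/Bee; adjust the location and interval as needed:
#   */30 * * * * $HOME/Desktop/Bee/scripts/learning/sync-claude-sessions.sh >> /tmp/bee-sync.log 2>&1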
set -euo pipefail
DRY_RUN=false
SINCE_HOURS=24
for arg in "$@"; do
case "$arg" in
--dry-run) DRY_RUN=true ;;
--since=*) SINCE_HOURS="${arg#--since=}" ;;
*) echo "WARN: unknown flag '$arg'" >&2 ;;
esac
done
REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
ENV_FILE="${REPO_ROOT}/.env"
if [ -f "${ENV_FILE}" ]; then
set -a
# shellcheck disable=SC1090
source "${ENV_FILE}"
set +a
fi
API_URL="${BEE_PORTAL_URL:-https://bee.cuilabs.io}"
SECRET="${CRON_SECRET:-}"
if [ -z "${SECRET}" ]; then
echo "FATAL: CRON_SECRET not set (check ${ENV_FILE} or env)" >&2
exit 1
fi
# NOTE: BEE_API_URL in .env points at the Modal/HF Python backend
# (model inference), NOT the Next.js portal where /api/learning/ingest
# lives. Use BEE_PORTAL_URL to override; default is production Vercel.
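# Illustrative .env entries (placeholder values):
#   BEE_PORTAL_URL=https://bee.cuilabs.io   # or a preview deployment URL
#   CRON_SECRET=...                         # same secret /api/learning/ingest checks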
CLAUDE_ROOT="${HOME}/.claude/projects"
if [ ! -d "${CLAUDE_ROOT}" ]; then
echo "FATAL: ${CLAUDE_ROOT} not found β Claude Code not installed?" >&2
exit 1
fi
CURSOR_DIR="${HOME}/.claude/bee-sync"
mkdir -p "${CURSOR_DIR}"
log() {
echo "[$(date -u +%H:%M:%S)] $*"
}
# Secrets we never want shipped to a public dataset. The list is
# deliberately broader than strictly necessary - better to lose a real
# token than to leak one.
SCRUB_PATTERNS=(
's/sk-[a-zA-Z0-9_-]{20,}/sk-REDACTED/g'
's/AKIA[0-9A-Z]{16}/AKIA-REDACTED/g'
's/(api[_-]?key|API[_-]?KEY)[":= ]+[a-zA-Z0-9_-]{16,}/\1=REDACTED/g'
's/Bearer [a-zA-Z0-9._-]{16,}/Bearer REDACTED/g'
's/(ghp|gho|ghu|ghs|ghr)_[a-zA-Z0-9]{36,}/gh-REDACTED/g'
's/hf_[a-zA-Z0-9]{32,}/hf_REDACTED/g'
's/xox[bpoa]-[a-zA-Z0-9-]{10,}/slack-REDACTED/g'
)
scrub() {
local content="$1"
for pat in "${SCRUB_PATTERNS[@]}"; do
content="$(printf '%s' "${content}" | sed -E "${pat}")"
done
printf '%s' "${content}"
}
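# Illustrative example (fake token): scrub 'Authorization: Bearer abc123def456ghi789jkl'
# prints 'Authorization: Bearer REDACTED'.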
# Decode Claude's encoded-path directory naming back to a repo name:
# "-Users-christopherfrost-Desktop-Bee" β "Bee"
decode_repo() {
local enc="$1"
basename "${enc//-//}"
}
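# Caveat: repo names that themselves contain hyphens (e.g. a hypothetical
# "bee-portal") decode to just their last segment ("portal"), since the
# encoding does not distinguish path separators from literal dashes.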
post_batch() {
local repo="$1"
local session_id="$2"
local events_json="$3"
local count
count="$(printf '%s' "${events_json}" | jq 'length')"
if [ "${count}" -eq 0 ]; then
return 0
fi
local payload
payload="$(jq -n \
--arg src "claude_session" \
--arg sid "${session_id}" \
--arg repo "${repo}" \
--argjson events "${events_json}" \
'{source: $src, session_id: $sid, repo: $repo, events: $events}')"
if [ "${DRY_RUN}" = "true" ]; then
log "DRY-RUN: would POST ${count} events for ${repo} session=${session_id:0:8}"
return 0
fi
local resp
if ! resp="$(curl -sS -X POST "${API_URL}/api/learning/ingest" \
-H "Authorization: Bearer ${SECRET}" \
-H "Content-Type: application/json" \
--data "${payload}" \
--max-time 60)"; then
log "ERR: curl failed posting batch for ${repo} session=${session_id:0:8}"
return 1
fi
local status
status="$(printf '%s' "${resp}" | jq -r '.status // "error"')"
if [ "${status}" = "ok" ]; then
local samples upload_path
samples="$(printf '%s' "${resp}" | jq -r '.samples')"
upload_path="$(printf '%s' "${resp}" | jq -r '.upload_path')"
log "OK: ${repo} session=${session_id:0:8} β ${samples} samples in ${upload_path}"
else
# Surface the raw API response so failures are debuggable - the old
# jq extraction of '.error // "unknown"' hid the real cause.
local err raw_resp
if ! err="$(printf '%s' "${resp}" | jq -r '.error // empty' 2>/dev/null)"; then
err=""
fi
if [ -z "${err}" ]; then
raw_resp="$(printf '%s' "${resp}" | head -c 300)"
err="raw_response: ${raw_resp}"
fi
log "ERR: ${repo} session=${session_id:0:8} β ${err}"
fi
}
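# For reference, the responses post_batch reads look roughly like this
# (shape inferred from the fields used above, not an authoritative contract):
#   {"status": "ok", "samples": 42, "upload_path": "..."}
#   {"status": "error", "error": "<reason>"}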
# Iterate every monorepo project Claude has tracked, every session.
# Per-session cursor file stores the last byte offset we pushed.
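# Cursor files live at ${CURSOR_DIR}/<encoded-project>__<session-id>.cursor and
# hold a single integer byte offset: everything before it has already been pushed.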
total_repos=0
total_sessions=0
total_events=0
# Global "first post in this script run" flag β so the 30 s throttle
# applies between EVERY POST, not just chunk-to-chunk within a single
# session. 2026-05-05 incident: a per-session flag let the first chunk
# of every session fire immediately, producing 9+ commits in <10 s
# (yesterday's 23:35:17β23:35:27 burst), which blew through HF's
# 128/hr commit ceiling and turned the rest of the run into 502s
# (the 54 % error rate Vercel reported on /api/learning/ingest).
first_post=true
for proj_dir in "${CLAUDE_ROOT}"/*/; do
proj="$(basename "${proj_dir%/}")"
repo="$(decode_repo "${proj}")"
total_repos=$((total_repos + 1))
for transcript in "${proj_dir}"*.jsonl; do
[ -e "${transcript}" ] || continue
session_id="$(basename "${transcript}" .jsonl)"
cursor_file="${CURSOR_DIR}/${proj}__${session_id}.cursor"
last_offset=0
if [ -f "${cursor_file}" ]; then
last_offset="$(cat "${cursor_file}")"
fi
file_size="$(stat -f %z "${transcript}" 2>/dev/null || stat -c %s "${transcript}")"
if [ "${last_offset}" -ge "${file_size}" ]; then
continue
fi
# Cutoff: skip transcripts not modified within the last --since= hours.
file_mtime="$(stat -f %m "${transcript}" 2>/dev/null || stat -c %Y "${transcript}")"
now_epoch="$(date +%s)"
age_hours=$(( (now_epoch - file_mtime) / 3600 ))
if [ "${age_hours}" -gt "${SINCE_HOURS}" ]; then
continue
fi
total_sessions=$((total_sessions + 1))
# Read new lines since cursor. Each line is a Claude session event.
# We parse LINE BY LINE (not jq -s) because Claude transcripts can
# legitimately contain literal control characters inside text fields,
# and `jq -s` aborts on the first bad line and loses the whole batch.
# `jq -c -R 'fromjson? | ...'` skips bad lines silently and emits
# one compact line per good event, which we then bundle into a JSON
# array via the final `jq -s`. Scrub secrets downstream.
tmp_jsonl="/tmp/bee-sync-events-$$-$(echo "${session_id}" | cut -c1-8).jsonl"
tail -c "+$((last_offset + 1))" "${transcript}" | jq -c -R '
fromjson?
| select(.type == "user" or .type == "assistant")
| {
type: .type,
timestamp: .timestamp,
text: (
if .message.content | type == "string" then .message.content
elif .message.content | type == "array" then
(.message.content | map(
if .type == "text" then .text
elif .type == "tool_use" then "[tool_use:" + .name + "] " + (.input | tostring | .[0:2000])
elif .type == "tool_result" then "[tool_result] " + (.content | tostring | .[0:2000])
else ""
end
) | join("\n"))
else ""
end
)
}
' > "${tmp_jsonl}" 2>/dev/null
if [ ! -s "${tmp_jsonl}" ]; then
events_json="[]"
else
events_json="$(jq -s '.' "${tmp_jsonl}")"
fi
rm -f "${tmp_jsonl}"
# Scrub secrets in the joined text fields. We do it as a final pass
# over the JSON string for simplicity - scrub() runs every sed pattern.
events_json="$(scrub "${events_json}")"
event_count="$(printf '%s' "${events_json}" | jq 'length')"
total_events=$((total_events + event_count))
if [ "${event_count}" -gt 0 ]; then
# Chunk size 500 (was 50) - HF rate-limits commits to 128/hr per
# repo, and the prior chunk=50 produced ~10x more commits than needed.
# The 2026-05-05 sync hit 429 after 70 commits with chunk=50.
# 500 events is roughly a 1MB payload, well inside Vercel's 4.5MB body limit.
# Plus a 30s sleep between POSTs so 30 sessions × ~5 batches each
# spread across ~75 min stays comfortably under the 128/hr ceiling.
offset=0
while [ "${offset}" -lt "${event_count}" ]; do
if [ "${first_post}" = "false" ] && [ "${DRY_RUN}" != "true" ]; then
sleep 30
fi
chunk="$(printf '%s' "${events_json}" | jq ".[${offset}:${offset}+500]")"
post_batch "${repo}" "${session_id}" "${chunk}"
offset=$((offset + 500))
first_post=false
done
fi
# Update cursor to current EOF - even if nothing landed, we mark
# this span of the file processed so the next run starts past it.
if [ "${DRY_RUN}" != "true" ]; then
printf '%s' "${file_size}" > "${cursor_file}"
fi
done
done
log "scan complete: repos=${total_repos} sessions=${total_sessions} events=${total_events}"