#!/usr/bin/env bash
# scripts/learning/sync-claude-sessions.sh
#
# Read Claude Code session transcripts from ~/.claude/projects/<repo>/*.jsonl
# for ALL CUI Labs monorepos, scrub secrets, batch events, POST to
# /api/learning/ingest. Tracks last-pushed cursor per session so it
# only sends NEW lines on each tick.
#
# Christopher's directive 2026-05-05: "everything is a dataset that you
# use - so let bee use it as well." This script is the local-machine
# half of that directive: Vercel can't read from his disk; this can.
#
# Usage:
#   ./scripts/learning/sync-claude-sessions.sh              # one-shot
#   ./scripts/learning/sync-claude-sessions.sh --dry-run    # preview only
#   ./scripts/learning/sync-claude-sessions.sh --since=N    # last N hours
#
# Requires:
#   - jq, curl, sed
#   - CRON_SECRET in .env (or environment)
#   - BEE_PORTAL_URL (optional; defaults to the production portal)
#
# Designed to run periodically via launchd or cron. Idempotent - re-runs
# only push lines past the cursor.
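#
# Example crontab entry (illustrative only - adjust the repo path and the
# cadence; launchd users would wrap the same command in a plist instead):
#   */30 * * * * $HOME/Desktop/Bee/scripts/learning/sync-claude-sessions.sh >> $HOME/.claude/bee-sync/sync.log 2>&1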
set -euo pipefail
DRY_RUN=false
SINCE_HOURS=24
for arg in "$@"; do
case "$arg" in
--dry-run) DRY_RUN=true ;;
--since=*) SINCE_HOURS="${arg#--since=}" ;;
*) echo "WARN: unknown flag '$arg'" >&2 ;;
esac
done
REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
ENV_FILE="${REPO_ROOT}/.env"
if [ -f "${ENV_FILE}" ]; then
  set -a
  # shellcheck disable=SC1090
  source "${ENV_FILE}"
  set +a
fi
API_URL="${BEE_PORTAL_URL:-https://bee.cuilabs.io}"
SECRET="${CRON_SECRET:-}"
if [ -z "${SECRET}" ]; then
echo "FATAL: CRON_SECRET not set (check ${ENV_FILE} or env)" >&2
exit 1
fi
# NOTE: BEE_API_URL in .env points at the Modal/HF Python backend
# (model inference), NOT the Next.js portal where /api/learning/ingest
# lives. Use BEE_PORTAL_URL to override; default is production Vercel.
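# Illustrative .env entries (placeholder values, not real secrets):
#   BEE_PORTAL_URL=https://bee.cuilabs.io
#   CRON_SECRET=<shared secret the ingest route checks>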
CLAUDE_ROOT="${HOME}/.claude/projects"
if [ ! -d "${CLAUDE_ROOT}" ]; then
echo "FATAL: ${CLAUDE_ROOT} not found β€” Claude Code not installed?" >&2
exit 1
fi
CURSOR_DIR="${HOME}/.claude/bee-sync"
mkdir -p "${CURSOR_DIR}"
log() {
echo "[$(date -u +%H:%M:%S)] $*"
}
# Secrets we never want shipped to a public dataset. The list is
# deliberately broader than strictly necessary; better to lose a real
# token than to leak one.
SCRUB_PATTERNS=(
  's/sk-[a-zA-Z0-9_-]{20,}/sk-REDACTED/g'
  's/AKIA[0-9A-Z]{16}/AKIA-REDACTED/g'
  's/(api[_-]?key|API[_-]?KEY)[":= ]+[a-zA-Z0-9_-]{16,}/\1=REDACTED/g'
  's/Bearer [a-zA-Z0-9._-]{16,}/Bearer REDACTED/g'
  's/(ghp|gho|ghu|ghs|ghr)_[a-zA-Z0-9]{36,}/gh-REDACTED/g'
  's/hf_[a-zA-Z0-9]{32,}/hf_REDACTED/g'
  's/xox[bpoa]-[a-zA-Z0-9-]{10,}/slack-REDACTED/g'
)
scrub() {
  local content="$1"
  for pat in "${SCRUB_PATTERNS[@]}"; do
    content="$(printf '%s' "${content}" | sed -E "${pat}")"
  done
  printf '%s' "${content}"
}
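# Illustrative only (fake token): scrub 'Bearer abc123def456ghi789jkl'
# prints 'Bearer REDACTED'; every pattern in SCRUB_PATTERNS is applied in turn.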
# Decode Claude's encoded-path directory naming back to a repo name:
# "-Users-christopherfrost-Desktop-Bee" β†’ "Bee"
decode_repo() {
  local enc="$1"
  basename "${enc//-//}"
}
post_batch() {
  local repo="$1"
  local session_id="$2"
  local events_json="$3"
  local count
  count="$(printf '%s' "${events_json}" | jq 'length')"
  if [ "${count}" -eq 0 ]; then
    return 0
  fi
  local payload
  payload="$(jq -n \
    --arg src "claude_session" \
    --arg sid "${session_id}" \
    --arg repo "${repo}" \
    --argjson events "${events_json}" \
    '{source: $src, session_id: $sid, repo: $repo, events: $events}')"
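  # The payload has roughly this shape (illustrative values):
  #   {"source":"claude_session","session_id":"<uuid>","repo":"Bee",
  #    "events":[{"type":"user","timestamp":"...","text":"..."}]}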
if [ "${DRY_RUN}" = "true" ]; then
log "DRY-RUN: would POST ${count} events for ${repo} session=${session_id:0:8}"
return 0
fi
local resp
if ! resp="$(curl -sS -X POST "${API_URL}/api/learning/ingest" \
-H "Authorization: Bearer ${SECRET}" \
-H "Content-Type: application/json" \
--data "${payload}" \
--max-time 60)"; then
log "ERR: curl failed posting batch for ${repo} session=${session_id:0:8}"
return 1
fi
local status
status="$(printf '%s' "${resp}" | jq -r '.status // "error"')"
if [ "${status}" = "ok" ]; then
local samples upload_path
samples="$(printf '%s' "${resp}" | jq -r '.samples')"
upload_path="$(printf '%s' "${resp}" | jq -r '.upload_path')"
log "OK: ${repo} session=${session_id:0:8} β†’ ${samples} samples in ${upload_path}"
else
# Surface the raw API response so debugging is auditable. Jq
# extraction of .error // "unknown" was hiding the real cause.
local err raw_resp
if ! err="$(printf '%s' "${resp}" | jq -r '.error // empty' 2>/dev/null)"; then
err=""
fi
if [ -z "${err}" ]; then
raw_resp="$(printf '%s' "${resp}" | head -c 300)"
err="raw_response: ${raw_resp}"
fi
log "ERR: ${repo} session=${session_id:0:8} β†’ ${err}"
fi
}
# Iterate every monorepo project Claude has tracked, every session.
# Per-session cursor file stores the last byte offset we pushed.
total_repos=0
total_sessions=0
total_events=0
# Global "first post in this script run" flag β€” so the 30 s throttle
# applies between EVERY POST, not just chunk-to-chunk within a single
# session. 2026-05-05 incident: a per-session flag let the first chunk
# of every session fire immediately, producing 9+ commits in <10 s
# (yesterday's 23:35:17–23:35:27 burst), which blew through HF's
# 128/hr commit ceiling and turned the rest of the run into 502s
# (the 54 % error rate Vercel reported on /api/learning/ingest).
first_post=true
for proj_dir in "${CLAUDE_ROOT}"/*/; do
proj="$(basename "${proj_dir%/}")"
repo="$(decode_repo "${proj}")"
total_repos=$((total_repos + 1))
for transcript in "${proj_dir}"*.jsonl; do
[ -e "${transcript}" ] || continue
session_id="$(basename "${transcript}" .jsonl)"
cursor_file="${CURSOR_DIR}/${proj}__${session_id}.cursor"
last_offset=0
if [ -f "${cursor_file}" ]; then
last_offset="$(cat "${cursor_file}")"
fi
file_size="$(stat -f %z "${transcript}" 2>/dev/null || stat -c %s "${transcript}")"
if [ "${last_offset}" -ge "${file_size}" ]; then
continue
fi
# Cutoff: skip files older than --since= hours since last modify.
file_mtime="$(stat -f %m "${transcript}" 2>/dev/null || stat -c %Y "${transcript}")"
now_epoch="$(date +%s)"
age_hours=$(( (now_epoch - file_mtime) / 3600 ))
if [ "${age_hours}" -gt "${SINCE_HOURS}" ]; then
continue
fi
total_sessions=$((total_sessions + 1))
# Read new lines since cursor. Each line is a Claude session event.
# We parse LINE BY LINE (not jq -s) because Claude transcripts can
# legitimately contain literal control characters inside text fields
# β€” `jq -s` aborts on the first bad line and loses the whole batch.
# `jq -c -R 'fromjson? | ...'` skips bad lines silently and emits
# one compact line per good event, which we then bundle into a JSON
# array via the final `jq -s`. Scrub secrets downstream.
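    # Illustrative (not a verbatim transcript line): an input line such as
    #   {"type":"assistant","timestamp":"2026-05-05T23:35:17Z","message":{"content":[{"type":"text","text":"done"}]}}
    # comes out of the filter below as
    #   {"type":"assistant","timestamp":"2026-05-05T23:35:17Z","text":"done"}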
tmp_jsonl="/tmp/bee-sync-events-$$-$(echo "${session_id}" | cut -c1-8).jsonl"
tail -c "+$((last_offset + 1))" "${transcript}" | jq -c -R '
fromjson?
| select(.type == "user" or .type == "assistant")
| {
type: .type,
timestamp: .timestamp,
text: (
if .message.content | type == "string" then .message.content
elif .message.content | type == "array" then
(.message.content | map(
if .type == "text" then .text
elif .type == "tool_use" then "[tool_use:" + .name + "] " + (.input | tostring | .[0:2000])
elif .type == "tool_result" then "[tool_result] " + (.content | tostring | .[0:2000])
else ""
end
) | join("\n"))
else ""
end
)
}
' > "${tmp_jsonl}" 2>/dev/null
if [ ! -s "${tmp_jsonl}" ]; then
events_json="[]"
else
events_json="$(jq -s '.' "${tmp_jsonl}")"
fi
rm -f "${tmp_jsonl}"
    # Scrub secrets in the joined text fields. We do it as a final pass
    # over the serialized JSON for simplicity - sed applies each pattern in turn.
    events_json="$(scrub "${events_json}")"
event_count="$(printf '%s' "${events_json}" | jq 'length')"
total_events=$((total_events + event_count))
if [ "${event_count}" -gt 0 ]; then
# Chunk size 500 (was 50) β€” HF rate-limits commits to 128/hr per
# repo, and prior chunk=50 produced ~10x more commits than needed.
# 2026-05-05 sync hit 429 after 70 commits with chunk=50.
# 500 events β‰ˆ ~1MB payload, well inside Vercel's 4.5MB body limit.
# Plus 30s sleep between POSTs so 30 sessions Γ— ~5 batches each
# spread across ~75 min β€” comfortably under the 128/hr ceiling.
offset=0
while [ "${offset}" -lt "${event_count}" ]; do
if [ "${first_post}" = "false" ] && [ "${DRY_RUN}" != "true" ]; then
sleep 30
fi
chunk="$(printf '%s' "${events_json}" | jq ".[${offset}:${offset}+500]")"
post_batch "${repo}" "${session_id}" "${chunk}"
offset=$((offset + 500))
first_post=false
done
fi
# Update cursor to current EOF β€” even if nothing landed, we mark
# this section processed so the next run starts past it.
if [ "${DRY_RUN}" != "true" ]; then
printf '%s' "${file_size}" > "${cursor_file}"
fi
done
done
log "scan complete: repos=${total_repos} sessions=${total_sessions} events=${total_events}"