#!/usr/bin/env bash
# scripts/learning/sync-claude-sessions.sh
#
# Read Claude Code session transcripts from ~/.claude/projects/<repo>/*.jsonl
# for ALL CUI Labs monorepos, scrub secrets, batch events, POST to
# /api/learning/ingest. Tracks last-pushed cursor per session so it
# only sends NEW lines on each tick.
#
# Christopher's directive 2026-05-05: "everything is a dataset that you
# use - so let bee use it as well." This script is the local-machine
# half of that directive - Vercel can't read from his disk; this can.
#
# Usage:
#   ./scripts/learning/sync-claude-sessions.sh             # one-shot
#   ./scripts/learning/sync-claude-sessions.sh --dry-run   # preview only
#   ./scripts/learning/sync-claude-sessions.sh --since=N   # last N hours
#
# Requires:
#   - jq, curl, sed
#   - CRON_SECRET in .env (or environment)
#   - BEE_PORTAL_URL optional; defaults to production (see NOTE below)
#
# Designed to run periodically via launchd or cron. Idempotent: re-runs
# only push lines past the cursor.
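#
# Scheduling sketch (paths are placeholders, not the real checkout):
#   */30 * * * * /path/to/repo/scripts/learning/sync-claude-sessions.sh >> "$HOME/.claude/bee-sync/sync.log" 2>&1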

set -euo pipefail

DRY_RUN=false
SINCE_HOURS=24
for arg in "$@"; do
  case "$arg" in
    --dry-run) DRY_RUN=true ;;
    --since=*) SINCE_HOURS="${arg#--since=}" ;;
    *) echo "WARN: unknown flag '$arg'" >&2 ;;
  esac
done

REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
ENV_FILE="${REPO_ROOT}/.env"
if [ -f "${ENV_FILE}" ]; then
  set -a
  # shellcheck disable=SC1090
  source "${ENV_FILE}"
  set +a
fi

API_URL="${BEE_PORTAL_URL:-https://bee.cuilabs.io}"
SECRET="${CRON_SECRET:-}"
if [ -z "${SECRET}" ]; then
  echo "FATAL: CRON_SECRET not set (check ${ENV_FILE} or env)" >&2
  exit 1
fi
# NOTE: BEE_API_URL in .env points at the Modal/HF Python backend
# (model inference), NOT the Next.js portal where /api/learning/ingest
# lives. Use BEE_PORTAL_URL to override; default is production Vercel.
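#
# Example .env entries (values are placeholders):
#   BEE_PORTAL_URL=https://bee.cuilabs.io
#   CRON_SECRET=<the shared secret the portal checks on ingest>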

CLAUDE_ROOT="${HOME}/.claude/projects"
if [ ! -d "${CLAUDE_ROOT}" ]; then
  echo "FATAL: ${CLAUDE_ROOT} not found β€” Claude Code not installed?" >&2
  exit 1
fi

CURSOR_DIR="${HOME}/.claude/bee-sync"
mkdir -p "${CURSOR_DIR}"
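# A cursor file is named "<encoded-project>__<session-id>.cursor", e.g.
#   ~/.claude/bee-sync/-Users-christopherfrost-Desktop-Bee__<session-id>.cursor
# and holds the last byte offset pushed (written by the loop below).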

log() {
  echo "[$(date -u +%H:%M:%S)] $*"
}

# Secrets we never want shipped to a public dataset. The list is
# deliberately broader than strictly necessary β€” better to lose a real
# token than to leak one.
SCRUB_PATTERNS=(
  's/sk-[a-zA-Z0-9_-]{20,}/sk-REDACTED/g'
  's/AKIA[0-9A-Z]{16}/AKIA-REDACTED/g'
  's/(api[_-]?key|API[_-]?KEY)[":= ]+[a-zA-Z0-9_-]{16,}/\1=REDACTED/g'
  's/Bearer [a-zA-Z0-9._-]{16,}/Bearer REDACTED/g'
  's/(ghp|gho|ghu|ghs|ghr)_[a-zA-Z0-9]{36,}/gh-REDACTED/g'
  's/hf_[a-zA-Z0-9]{32,}/hf_REDACTED/g'
  's/xox[bpoa]-[a-zA-Z0-9-]{10,}/slack-REDACTED/g'
)
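# For example (illustrative token, not a real one), a transcript line
# containing "Bearer abc123def456ghi789xyz" is rewritten to
# "Bearer REDACTED"; the GitHub, HF, Slack, and AWS patterns behave
# the same way.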

scrub() {
  local content="$1"
  for pat in "${SCRUB_PATTERNS[@]}"; do
    content="$(printf '%s' "${content}" | sed -E "${pat}")"
  done
  printf '%s' "${content}"
}

# Decode Claude's encoded-path directory naming back to a repo name:
# "-Users-christopherfrost-Desktop-Bee" -> "Bee"
decode_repo() {
  local enc="$1"
  basename "${enc//-//}"
}
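# Known limitation: every dash becomes a path separator, so a repo name
# that itself contains dashes keeps only its last segment
# ("my-repo" would decode to "repo").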

post_batch() {
  local repo="$1"
  local session_id="$2"
  local events_json="$3"
  local count
  count="$(printf '%s' "${events_json}" | jq 'length')"
  if [ "${count}" -eq 0 ]; then
    return 0
  fi
  local payload
  payload="$(jq -n \
    --arg src "claude_session" \
    --arg sid "${session_id}" \
    --arg repo "${repo}" \
    --argjson events "${events_json}" \
    '{source: $src, session_id: $sid, repo: $repo, events: $events}')"
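  # Resulting payload shape (illustrative values):
  #   {"source":"claude_session","session_id":"<uuid>","repo":"Bee",
  #    "events":[{"type":"user","timestamp":"...","text":"..."}, ...]}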
  if [ "${DRY_RUN}" = "true" ]; then
    log "DRY-RUN: would POST ${count} events for ${repo} session=${session_id:0:8}"
    return 0
  fi
  local resp
  if ! resp="$(curl -sS -X POST "${API_URL}/api/learning/ingest" \
    -H "Authorization: Bearer ${SECRET}" \
    -H "Content-Type: application/json" \
    --data "${payload}" \
    --max-time 60)"; then
    log "ERR: curl failed posting batch for ${repo} session=${session_id:0:8}"
    return 1
  fi
  local status
  status="$(printf '%s' "${resp}" | jq -r '.status // "error"')"
  if [ "${status}" = "ok" ]; then
    local samples upload_path
    samples="$(printf '%s' "${resp}" | jq -r '.samples')"
    upload_path="$(printf '%s' "${resp}" | jq -r '.upload_path')"
    log "OK: ${repo} session=${session_id:0:8} β†’ ${samples} samples in ${upload_path}"
  else
    # Surface the raw API response so debugging is auditable - jq
    # extraction of '.error // "unknown"' was hiding the real cause.
    local err raw_resp
    if ! err="$(printf '%s' "${resp}" | jq -r '.error // empty' 2>/dev/null)"; then
      err=""
    fi
    if [ -z "${err}" ]; then
      raw_resp="$(printf '%s' "${resp}" | head -c 300)"
      err="raw_response: ${raw_resp}"
    fi
    log "ERR: ${repo} session=${session_id:0:8} β†’ ${err}"
  fi
}

# Iterate every monorepo project Claude has tracked, every session.
# Per-session cursor file stores the last byte offset we pushed.
total_repos=0
total_sessions=0
total_events=0
# Global "first post in this script run" flag β€” so the 30 s throttle
# applies between EVERY POST, not just chunk-to-chunk within a single
# session. 2026-05-05 incident: a per-session flag let the first chunk
# of every session fire immediately, producing 9+ commits in <10 s
# (yesterday's 23:35:17–23:35:27 burst), which blew through HF's
# 128/hr commit ceiling and turned the rest of the run into 502s
# (the 54 % error rate Vercel reported on /api/learning/ingest).
first_post=true

for proj_dir in "${CLAUDE_ROOT}"/*/; do
  proj="$(basename "${proj_dir%/}")"
  repo="$(decode_repo "${proj}")"
  total_repos=$((total_repos + 1))
  for transcript in "${proj_dir}"*.jsonl; do
    [ -e "${transcript}" ] || continue
    session_id="$(basename "${transcript}" .jsonl)"
    cursor_file="${CURSOR_DIR}/${proj}__${session_id}.cursor"
    last_offset=0
    if [ -f "${cursor_file}" ]; then
      last_offset="$(cat "${cursor_file}")"
    fi
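    # BSD stat (macOS) takes -f %z for size; GNU stat (Linux) takes -c %s.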
    file_size="$(stat -f %z "${transcript}" 2>/dev/null || stat -c %s "${transcript}")"
    if [ "${last_offset}" -ge "${file_size}" ]; then
      continue
    fi

    # Cutoff: skip transcripts not modified within the last --since hours.
    file_mtime="$(stat -f %m "${transcript}" 2>/dev/null || stat -c %Y "${transcript}")"
    now_epoch="$(date +%s)"
    age_hours=$(( (now_epoch - file_mtime) / 3600 ))
    if [ "${age_hours}" -gt "${SINCE_HOURS}" ]; then
      continue
    fi

    total_sessions=$((total_sessions + 1))
    # Read new lines since the cursor. Each line is a Claude session
    # event. We parse LINE BY LINE (not jq -s) because Claude transcripts
    # can legitimately contain literal control characters inside text
    # fields, and `jq -s` aborts on the first bad line and loses the
    # whole batch. `jq -c -R 'fromjson? | ...'` skips bad lines silently
    # and emits one compact line per good event, which the final `jq -s`
    # bundles into a JSON array. Secrets are scrubbed downstream.
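    # Illustrative example (transcript shape assumed from the filter):
    #   {"type":"user","timestamp":"2026-05-05T23:35:17Z","message":{"content":"fix the cron"}}
    # becomes the compact event
    #   {"type":"user","timestamp":"2026-05-05T23:35:17Z","text":"fix the cron"}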
    tmp_jsonl="/tmp/bee-sync-events-$$-${session_id:0:8}.jsonl"
    tail -c "+$((last_offset + 1))" "${transcript}" | jq -c -R '
      fromjson?
      | select(.type == "user" or .type == "assistant")
      | {
          type: .type,
          timestamp: .timestamp,
          text: (
            if .message.content | type == "string" then .message.content
            elif .message.content | type == "array" then
              (.message.content | map(
                if .type == "text" then .text
                elif .type == "tool_use" then "[tool_use:" + .name + "] " + (.input | tostring | .[0:2000])
                elif .type == "tool_result" then "[tool_result] " + (.content | tostring | .[0:2000])
                else ""
                end
              ) | join("\n"))
            else ""
            end
          )
        }
    ' > "${tmp_jsonl}" 2>/dev/null
    if [ ! -s "${tmp_jsonl}" ]; then
      events_json="[]"
    else
      events_json="$(jq -s '.' "${tmp_jsonl}")"
    fi
    rm -f "${tmp_jsonl}"

    # Scrub secrets in the joined text fields. For simplicity this is a
    # final pass over the serialized JSON, using scrub() defined above.
    events_json="$(scrub "${events_json}")"

    event_count="$(printf '%s' "${events_json}" | jq 'length')"
    total_events=$((total_events + event_count))

    if [ "${event_count}" -gt 0 ]; then
      # Chunk size 500 (was 50): HF rate-limits commits to 128/hr per
      # repo, and the prior chunk=50 produced ~10x more commits than
      # needed - the 2026-05-05 sync hit 429 after 70 commits with
      # chunk=50. 500 events is ~1MB of payload, well inside Vercel's
      # 4.5MB body limit. The 30s sleep between POSTs spreads 30
      # sessions x ~5 batches each across ~75 min, comfortably under
      # the 128/hr ceiling.
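      # Worked out: 30 sessions x 5 batches = 150 POSTs; one POST per
      # 30s is 120 commits/hr, under the 128/hr cap, and 150 x 30s
      # = 75 min end to end.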
      offset=0
      while [ "${offset}" -lt "${event_count}" ]; do
        if [ "${first_post}" = "false" ] && [ "${DRY_RUN}" != "true" ]; then
          sleep 30
        fi
        chunk="$(printf '%s' "${events_json}" | jq ".[${offset}:${offset}+500]")"
        post_batch "${repo}" "${session_id}" "${chunk}"
        offset=$((offset + 500))
        first_post=false
      done
    fi

    # Update cursor to current EOF - even if nothing landed, we mark
    # this section processed so the next run starts past it.
    if [ "${DRY_RUN}" != "true" ]; then
      printf '%s' "${file_size}" > "${cursor_file}"
    fi
  done
done

log "scan complete: repos=${total_repos} sessions=${total_sessions} events=${total_events}"