#!/usr/bin/env bash
# scripts/learning/sync-claude-sessions.sh
#
# Read Claude Code session transcripts from ~/.claude/projects/<repo>/*.jsonl
# for ALL CUI Labs monorepos, scrub secrets, batch events, POST to
# /api/learning/ingest. Tracks last-pushed cursor per session so it
# only sends NEW lines on each tick.
#
# Christopher's directive 2026-05-05: "everything is a dataset that you
# use - so let bee use it as well." This script is the local-machine
# half of that directive - Vercel can't read from his disk; this can.
#
# Usage:
#   ./scripts/learning/sync-claude-sessions.sh             # one-shot
#   ./scripts/learning/sync-claude-sessions.sh --dry-run   # preview only
#   ./scripts/learning/sync-claude-sessions.sh --since=N   # last N hours
#
# Requires:
#   - jq, curl, sed
#   - CRON_SECRET in .env (or environment)
#   - BEE_PORTAL_URL optional; defaults to production (see NOTE below)
#
# Designed to run periodically via launchd or cron. Idempotent: re-runs
# only push lines past the cursor.
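#
# Scheduling sketch (paths are placeholders, not the real checkout):
#   */30 * * * * /path/to/repo/scripts/learning/sync-claude-sessions.sh >> "$HOME/.claude/bee-sync/sync.log" 2>&1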

set -euo pipefail

DRY_RUN=false
SINCE_HOURS=24
for arg in "$@"; do
  case "$arg" in
    --dry-run) DRY_RUN=true ;;
    --since=*) SINCE_HOURS="${arg#--since=}" ;;
    *) echo "WARN: unknown flag '$arg'" >&2 ;;
  esac
done

REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
ENV_FILE="${REPO_ROOT}/.env"
if [ -f "${ENV_FILE}" ]; then
  set -a
  # shellcheck disable=SC1090
  source "${ENV_FILE}"
  set +a
fi

API_URL="${BEE_PORTAL_URL:-https://bee.cuilabs.io}"
SECRET="${CRON_SECRET:-}"
if [ -z "${SECRET}" ]; then
  echo "FATAL: CRON_SECRET not set (check ${ENV_FILE} or env)" >&2
  exit 1
fi
# NOTE: BEE_API_URL in .env points at the Modal/HF Python backend
# (model inference), NOT the Next.js portal where /api/learning/ingest
# lives. Use BEE_PORTAL_URL to override; default is production Vercel.
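#
# Example .env entries (values are placeholders):
#   BEE_PORTAL_URL=https://bee.cuilabs.io
#   CRON_SECRET=<the shared secret the portal checks on ingest>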

CLAUDE_ROOT="${HOME}/.claude/projects"
if [ ! -d "${CLAUDE_ROOT}" ]; then
  echo "FATAL: ${CLAUDE_ROOT} not found β€” Claude Code not installed?" >&2
  exit 1
fi

CURSOR_DIR="${HOME}/.claude/bee-sync"
mkdir -p "${CURSOR_DIR}"
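# A cursor file is named "<encoded-project>__<session-id>.cursor", e.g.
#   ~/.claude/bee-sync/-Users-christopherfrost-Desktop-Bee__<session-id>.cursor
# and holds the last byte offset pushed (written by the loop below).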

log() {
  echo "[$(date -u +%H:%M:%S)] $*"
}

# Secrets we never want shipped to a public dataset. The list is
# deliberately broader than strictly necessary β€” better to lose a real
# token than to leak one.
SCRUB_PATTERNS=(
  's/sk-[a-zA-Z0-9_-]{20,}/sk-REDACTED/g'
  's/AKIA[0-9A-Z]{16}/AKIA-REDACTED/g'
  's/(api[_-]?key|API[_-]?KEY)[":= ]+[a-zA-Z0-9_-]{16,}/\1=REDACTED/g'
  's/Bearer [a-zA-Z0-9._-]{16,}/Bearer REDACTED/g'
  's/(ghp|gho|ghu|ghs|ghr)_[a-zA-Z0-9]{36,}/gh-REDACTED/g'
  's/hf_[a-zA-Z0-9]{32,}/hf_REDACTED/g'
  's/xox[bpoa]-[a-zA-Z0-9-]{10,}/slack-REDACTED/g'
)
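# For example (illustrative token, not a real one), a transcript line
# containing "Bearer abc123def456ghi789xyz" is rewritten to
# "Bearer REDACTED"; the GitHub, HF, Slack, and AWS patterns behave
# the same way.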

scrub() {
  local content="$1"
  for pat in "${SCRUB_PATTERNS[@]}"; do
    content="$(printf '%s' "${content}" | sed -E "${pat}")"
  done
  printf '%s' "${content}"
}

# Decode Claude's encoded-path directory naming back to a repo name:
# "-Users-christopherfrost-Desktop-Bee" -> "Bee"
decode_repo() {
  local enc="$1"
  basename "${enc//-//}"
}
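# Known limitation: every dash becomes a path separator, so a repo name
# that itself contains dashes keeps only its last segment
# ("my-repo" would decode to "repo").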

post_batch() {
  local repo="$1"
  local session_id="$2"
  local events_json="$3"
  local count
  count="$(printf '%s' "${events_json}" | jq 'length')"
  if [ "${count}" -eq 0 ]; then
    return 0
  fi
  local payload
  payload="$(jq -n \
    --arg src "claude_session" \
    --arg sid "${session_id}" \
    --arg repo "${repo}" \
    --argjson events "${events_json}" \
    '{source: $src, session_id: $sid, repo: $repo, events: $events}')"
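  # Resulting payload shape (illustrative values):
  #   {"source":"claude_session","session_id":"<uuid>","repo":"Bee",
  #    "events":[{"type":"user","timestamp":"...","text":"..."}, ...]}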
  if [ "${DRY_RUN}" = "true" ]; then
    log "DRY-RUN: would POST ${count} events for ${repo} session=${session_id:0:8}"
    return 0
  fi
  local resp
  if ! resp="$(curl -sS -X POST "${API_URL}/api/learning/ingest" \
    -H "Authorization: Bearer ${SECRET}" \
    -H "Content-Type: application/json" \
    --data "${payload}" \
    --max-time 60)"; then
    log "ERR: curl failed posting batch for ${repo} session=${session_id:0:8}"
    return 1
  fi
  local status
  status="$(printf '%s' "${resp}" | jq -r '.status // "error"')"
  if [ "${status}" = "ok" ]; then
    local samples upload_path
    samples="$(printf '%s' "${resp}" | jq -r '.samples')"
    upload_path="$(printf '%s' "${resp}" | jq -r '.upload_path')"
    log "OK: ${repo} session=${session_id:0:8} β†’ ${samples} samples in ${upload_path}"
  else
    # Surface the raw API response so debugging is auditable - jq
    # extraction of '.error // "unknown"' was hiding the real cause.
    local err raw_resp
    if ! err="$(printf '%s' "${resp}" | jq -r '.error // empty' 2>/dev/null)"; then
      err=""
    fi
    if [ -z "${err}" ]; then
      raw_resp="$(printf '%s' "${resp}" | head -c 300)"
      err="raw_response: ${raw_resp}"
    fi
    log "ERR: ${repo} session=${session_id:0:8} β†’ ${err}"
  fi
}

# Iterate every monorepo project Claude has tracked, every session.
# Per-session cursor file stores the last byte offset we pushed.
total_repos=0
total_sessions=0
total_events=0
# Global "first post in this script run" flag β€” so the 30 s throttle
# applies between EVERY POST, not just chunk-to-chunk within a single
# session. 2026-05-05 incident: a per-session flag let the first chunk
# of every session fire immediately, producing 9+ commits in <10 s
# (yesterday's 23:35:17–23:35:27 burst), which blew through HF's
# 128/hr commit ceiling and turned the rest of the run into 502s
# (the 54 % error rate Vercel reported on /api/learning/ingest).
first_post=true

for proj_dir in "${CLAUDE_ROOT}"/*/; do
  proj="$(basename "${proj_dir%/}")"
  repo="$(decode_repo "${proj}")"
  total_repos=$((total_repos + 1))
  for transcript in "${proj_dir}"*.jsonl; do
    [ -e "${transcript}" ] || continue
    session_id="$(basename "${transcript}" .jsonl)"
    cursor_file="${CURSOR_DIR}/${proj}__${session_id}.cursor"
    last_offset=0
    if [ -f "${cursor_file}" ]; then
      last_offset="$(cat "${cursor_file}")"
    fi
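    # BSD stat (macOS) takes -f %z for size; GNU stat (Linux) takes -c %s.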
    file_size="$(stat -f %z "${transcript}" 2>/dev/null || stat -c %s "${transcript}")"
    if [ "${last_offset}" -ge "${file_size}" ]; then
      continue
    fi

    # Cutoff: skip transcripts not modified within the last --since hours.
    file_mtime="$(stat -f %m "${transcript}" 2>/dev/null || stat -c %Y "${transcript}")"
    now_epoch="$(date +%s)"
    age_hours=$(( (now_epoch - file_mtime) / 3600 ))
    if [ "${age_hours}" -gt "${SINCE_HOURS}" ]; then
      continue
    fi

    total_sessions=$((total_sessions + 1))
    # Read new lines since the cursor. Each line is a Claude session
    # event. We parse LINE BY LINE (not jq -s) because Claude transcripts
    # can legitimately contain literal control characters inside text
    # fields, and `jq -s` aborts on the first bad line and loses the
    # whole batch. `jq -c -R 'fromjson? | ...'` skips bad lines silently
    # and emits one compact line per good event, which the final `jq -s`
    # bundles into a JSON array. Secrets are scrubbed downstream.
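    # Illustrative example (transcript shape assumed from the filter):
    #   {"type":"user","timestamp":"2026-05-05T23:35:17Z","message":{"content":"fix the cron"}}
    # becomes the compact event
    #   {"type":"user","timestamp":"2026-05-05T23:35:17Z","text":"fix the cron"}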
    tmp_jsonl="/tmp/bee-sync-events-$$-${session_id:0:8}.jsonl"
    tail -c "+$((last_offset + 1))" "${transcript}" | jq -c -R '
      fromjson?
      | select(.type == "user" or .type == "assistant")
      | {
          type: .type,
          timestamp: .timestamp,
          text: (
            if .message.content | type == "string" then .message.content
            elif .message.content | type == "array" then
              (.message.content | map(
                if .type == "text" then .text
                elif .type == "tool_use" then "[tool_use:" + .name + "] " + (.input | tostring | .[0:2000])
                elif .type == "tool_result" then "[tool_result] " + (.content | tostring | .[0:2000])
                else ""
                end
              ) | join("\n"))
            else ""
            end
          )
        }
    ' > "${tmp_jsonl}" 2>/dev/null
    if [ ! -s "${tmp_jsonl}" ]; then
      events_json="[]"
    else
      events_json="$(jq -s '.' "${tmp_jsonl}")"
    fi
    rm -f "${tmp_jsonl}"

    # Scrub secrets in the joined text fields. For simplicity this is a
    # final pass over the serialized JSON, using scrub() defined above.
    events_json="$(scrub "${events_json}")"

    event_count="$(printf '%s' "${events_json}" | jq 'length')"
    total_events=$((total_events + event_count))

    if [ "${event_count}" -gt 0 ]; then
      # Chunk size 500 (was 50): HF rate-limits commits to 128/hr per
      # repo, and the prior chunk=50 produced ~10x more commits than
      # needed - the 2026-05-05 sync hit 429 after 70 commits with
      # chunk=50. 500 events is ~1MB of payload, well inside Vercel's
      # 4.5MB body limit. The 30s sleep between POSTs spreads 30
      # sessions x ~5 batches each across ~75 min, comfortably under
      # the 128/hr ceiling.
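      # Worked out: 30 sessions x 5 batches = 150 POSTs; one POST per
      # 30s is 120 commits/hr, under the 128/hr cap, and 150 x 30s
      # = 75 min end to end.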
      offset=0
      while [ "${offset}" -lt "${event_count}" ]; do
        if [ "${first_post}" = "false" ] && [ "${DRY_RUN}" != "true" ]; then
          sleep 30
        fi
        chunk="$(printf '%s' "${events_json}" | jq ".[${offset}:${offset}+500]")"
        post_batch "${repo}" "${session_id}" "${chunk}"
        offset=$((offset + 500))
        first_post=false
      done
    fi

    # Update cursor to current EOF - even if nothing landed, we mark
    # this section processed so the next run starts past it.
    if [ "${DRY_RUN}" != "true" ]; then
      printf '%s' "${file_size}" > "${cursor_file}"
    fi
  done
done

log "scan complete: repos=${total_repos} sessions=${total_sessions} events=${total_events}"