| #!/usr/bin/env bash |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| set -euo pipefail |
|
|
| DRY_RUN=false |
| SINCE_HOURS=24 |
| for arg in "$@"; do |
| case "$arg" in |
| --dry-run) DRY_RUN=true ;; |
| --since=*) SINCE_HOURS="${arg#--since=}" ;; |
| *) echo "WARN: unknown flag '$arg'" >&2 ;; |
| esac |
| done |
|
|
| REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)" |
| ENV_FILE="${REPO_ROOT}/.env" |
| if [ -f "${ENV_FILE}" ]; then |
| set -a |
| |
| source "${ENV_FILE}" |
| set +a |
| fi |
|
|
| API_URL="${BEE_PORTAL_URL:-https://bee.cuilabs.io}" |
| SECRET="${CRON_SECRET:-}" |
| if [ -z "${SECRET}" ]; then |
| echo "FATAL: CRON_SECRET not set (check ${ENV_FILE} or env)" >&2 |
| exit 1 |
| fi |
| |
| |
| |
|
|
| CLAUDE_ROOT="${HOME}/.claude/projects" |
| if [ ! -d "${CLAUDE_ROOT}" ]; then |
| echo "FATAL: ${CLAUDE_ROOT} not found β Claude Code not installed?" >&2 |
| exit 1 |
| fi |
|
|
| CURSOR_DIR="${HOME}/.claude/bee-sync" |
| mkdir -p "${CURSOR_DIR}" |
|
|
| log() { |
| echo "[$(date -u +%H:%M:%S)] $*" |
| } |
|
|
| |
| |
| |
| SCRUB_PATTERNS=( |
| 's/sk-[a-zA-Z0-9_-]{20,}/sk-REDACTED/g' |
| 's/AKIA[0-9A-Z]{16}/AKIA-REDACTED/g' |
| 's/(api[_-]?key|API[_-]?KEY)[":= ]+[a-zA-Z0-9_-]{16,}/\1=REDACTED/g' |
| 's/Bearer [a-zA-Z0-9._-]{16,}/Bearer REDACTED/g' |
| 's/(ghp|gho|ghu|ghs|ghr)_[a-zA-Z0-9]{36,}/gh-REDACTED/g' |
| 's/hf_[a-zA-Z0-9]{32,}/hf_REDACTED/g' |
| 's/xox[bpoa]-[a-zA-Z0-9-]{10,}/slack-REDACTED/g' |
| ) |
|
|
| scrub() { |
| local content="$1" |
| for pat in "${SCRUB_PATTERNS[@]}"; do |
| content="$(printf '%s' "${content}" | sed -E "${pat}")" |
| done |
| printf '%s' "${content}" |
| } |
|
|
| |
| |
| decode_repo() { |
| local enc="$1" |
| basename "${enc//-//}" |
| } |
|
|
| post_batch() { |
| local repo="$1" |
| local session_id="$2" |
| local events_json="$3" |
| local count |
| count="$(printf '%s' "${events_json}" | jq 'length')" |
| if [ "${count}" -eq 0 ]; then |
| return 0 |
| fi |
| local payload |
| payload="$(jq -n \ |
| --arg src "claude_session" \ |
| --arg sid "${session_id}" \ |
| --arg repo "${repo}" \ |
| --argjson events "${events_json}" \ |
| '{source: $src, session_id: $sid, repo: $repo, events: $events}')" |
| if [ "${DRY_RUN}" = "true" ]; then |
| log "DRY-RUN: would POST ${count} events for ${repo} session=${session_id:0:8}" |
| return 0 |
| fi |
| local resp |
| if ! resp="$(curl -sS -X POST "${API_URL}/api/learning/ingest" \ |
| -H "Authorization: Bearer ${SECRET}" \ |
| -H "Content-Type: application/json" \ |
| --data "${payload}" \ |
| --max-time 60)"; then |
| log "ERR: curl failed posting batch for ${repo} session=${session_id:0:8}" |
| return 1 |
| fi |
| local status |
| status="$(printf '%s' "${resp}" | jq -r '.status // "error"')" |
| if [ "${status}" = "ok" ]; then |
| local samples upload_path |
| samples="$(printf '%s' "${resp}" | jq -r '.samples')" |
| upload_path="$(printf '%s' "${resp}" | jq -r '.upload_path')" |
| log "OK: ${repo} session=${session_id:0:8} β ${samples} samples in ${upload_path}" |
| else |
| |
| |
| local err raw_resp |
| if ! err="$(printf '%s' "${resp}" | jq -r '.error // empty' 2>/dev/null)"; then |
| err="" |
| fi |
| if [ -z "${err}" ]; then |
| raw_resp="$(printf '%s' "${resp}" | head -c 300)" |
| err="raw_response: ${raw_resp}" |
| fi |
| log "ERR: ${repo} session=${session_id:0:8} β ${err}" |
| fi |
| } |
|
|
| |
| |
| total_repos=0 |
| total_sessions=0 |
| total_events=0 |
| |
| |
| |
| |
| |
| |
| |
| first_post=true |
|
|
| for proj_dir in "${CLAUDE_ROOT}"/*/; do |
| proj="$(basename "${proj_dir%/}")" |
| repo="$(decode_repo "${proj}")" |
| total_repos=$((total_repos + 1)) |
| for transcript in "${proj_dir}"*.jsonl; do |
| [ -e "${transcript}" ] || continue |
| session_id="$(basename "${transcript}" .jsonl)" |
| cursor_file="${CURSOR_DIR}/${proj}__${session_id}.cursor" |
| last_offset=0 |
| if [ -f "${cursor_file}" ]; then |
| last_offset="$(cat "${cursor_file}")" |
| fi |
| file_size="$(stat -f %z "${transcript}" 2>/dev/null || stat -c %s "${transcript}")" |
| if [ "${last_offset}" -ge "${file_size}" ]; then |
| continue |
| fi |
|
|
| |
| file_mtime="$(stat -f %m "${transcript}" 2>/dev/null || stat -c %Y "${transcript}")" |
| now_epoch="$(date +%s)" |
| age_hours=$(( (now_epoch - file_mtime) / 3600 )) |
| if [ "${age_hours}" -gt "${SINCE_HOURS}" ]; then |
| continue |
| fi |
|
|
| total_sessions=$((total_sessions + 1)) |
| |
| |
| |
| |
| |
| |
| |
| tmp_jsonl="/tmp/bee-sync-events-$$-$(echo "${session_id}" | cut -c1-8).jsonl" |
| tail -c "+$((last_offset + 1))" "${transcript}" | jq -c -R ' |
| fromjson? |
| | select(.type == "user" or .type == "assistant") |
| | { |
| type: .type, |
| timestamp: .timestamp, |
| text: ( |
| if .message.content | type == "string" then .message.content |
| elif .message.content | type == "array" then |
| (.message.content | map( |
| if .type == "text" then .text |
| elif .type == "tool_use" then "[tool_use:" + .name + "] " + (.input | tostring | .[0:2000]) |
| elif .type == "tool_result" then "[tool_result] " + (.content | tostring | .[0:2000]) |
| else "" |
| end |
| ) | join("\n")) |
| else "" |
| end |
| ) |
| } |
| ' > "${tmp_jsonl}" 2>/dev/null |
| if [ ! -s "${tmp_jsonl}" ]; then |
| events_json="[]" |
| else |
| events_json="$(jq -s '.' "${tmp_jsonl}")" |
| fi |
| rm -f "${tmp_jsonl}" |
|
|
| |
| |
| for pat in "${SCRUB_PATTERNS[@]}"; do |
| events_json="$(printf '%s' "${events_json}" | sed -E "${pat}")" |
| done |
|
|
| event_count="$(printf '%s' "${events_json}" | jq 'length')" |
| total_events=$((total_events + event_count)) |
|
|
| if [ "${event_count}" -gt 0 ]; then |
| |
| |
| |
| |
| |
| |
| offset=0 |
| while [ "${offset}" -lt "${event_count}" ]; do |
| if [ "${first_post}" = "false" ] && [ "${DRY_RUN}" != "true" ]; then |
| sleep 30 |
| fi |
| chunk="$(printf '%s' "${events_json}" | jq ".[${offset}:${offset}+500]")" |
| post_batch "${repo}" "${session_id}" "${chunk}" |
| offset=$((offset + 500)) |
| first_post=false |
| done |
| fi |
|
|
| |
| |
| if [ "${DRY_RUN}" != "true" ]; then |
| printf '%s' "${file_size}" > "${cursor_file}" |
| fi |
| done |
| done |
|
|
| log "scan complete: repos=${total_repos} sessions=${total_sessions} events=${total_events}" |
|
|