archon-dataset-sync / sync_dataset.sh
personalbotai
Deploy Archon Dataset Sync v2.1 with branch support\n\n- Add sync_dataset.sh with DATASET_BRANCH support\n- Add Flask monitoring dashboard (app.py)\n- Add Dockerfile for HF Space deployment\n- Add comprehensive documentation\n- Security hardening (upstream protection)\n- Auto-retry with exponential backoff\n- Health checks and graceful shutdown\n\nArchon Standard: Build for Eternity
9de9a1b
#!/bin/bash
# Dataset sync daemon.
#
# At startup the script restores the workspace (and config.json) from a
# git-backed backup directory; afterwards it loops forever, copying the
# workspace back into that directory and committing/pushing it to the
# configured branch.
#
# All settings are read from environment variables; defaults are assigned in
# the configuration section that follows.
set -euo pipefail # Strict mode: exit on error, undefined var, pipefail
# ============================================
# CONFIGURATION
# ============================================
# Every setting keeps a non-empty value inherited from the environment and
# otherwise falls back to the default given here.
: "${DATASET_REPO:=https://github.com/personalbotai/picoclaw-memory.git}"
: "${DATASET_BRANCH:=main}"        # branch to sync against
: "${SYNC_INTERVAL:=300}"          # seconds between sync cycles
: "${MAX_RETRIES:=3}"
: "${BACKUP_RETENTION_DAYS:=7}"
: "${MIN_DISK_FREE_MB:=1024}"      # 1GB minimum

# ============================================
# PATH RESOLUTION
# ============================================
# HOME may be unset or empty; fall back to /root in that case.
[ -n "${HOME:-}" ] || export HOME="/root"

# Everything the daemon reads or writes lives below PICOCLAW_HOME.
: "${PICOCLAW_HOME:=$HOME/.picoclaw}"
WORKSPACE_DIR="${PICOCLAW_HOME}/workspace"
CONFIG_FILE="${PICOCLAW_HOME}/config.json"
BACKUP_DIR="${PICOCLAW_HOME}/backup"
LOG_FILE="${PICOCLAW_HOME}/sync.log"
STATE_FILE="${PICOCLAW_HOME}/sync.state"
# ============================================
# LOGGING & UTILS
# ============================================
# Print a timestamped log line to stdout and append it to LOG_FILE.
#
# Arguments:
#   $1 - severity label (e.g. INFO, WARN, ERROR)
#   $2 - message text
# Returns: always 0. Writing to the log file is best-effort, so a missing or
#   unwritable log directory can never abort the daemon under `set -e`.
log() {
  local level="$1"
  local msg="$2"
  local line
  line="[$(date '+%Y-%m-%d %H:%M:%S')] [$level] $msg"

  printf '%s\n' "$line"

  # The log directory may not exist yet on a first run; create it on demand
  # and silently skip the file copy if that (or the append) is impossible.
  if mkdir -p -- "$(dirname -- "$LOG_FILE")" 2>/dev/null; then
    { printf '%s\n' "$line" >> "$LOG_FILE"; } 2>/dev/null || true
  fi
  return 0
}
# Log a fatal error and terminate the script with exit status 1.
#
# Arguments:
#   $1 - description of the failure
die() {
  local reason="$1"
  log "ERROR" "$reason"
  exit 1
}
# ============================================
# GRACEFUL SHUTDOWN
# ============================================
# Signal handler: announce the shutdown, remove the state (lock) file and
# leave with exit status 0.
shutdown() {
  log "INFO" "Received shutdown signal, exiting gracefully..."
  rm -f "$STATE_FILE"
  exit 0
}

# Route both termination signals through the same handler.
trap shutdown SIGINT
trap shutdown SIGTERM
# ============================================
# GIT SETUP (non-destructive)
# ============================================
# Prepare git identity and (optionally) credentials for the backup repository.
#
# Globals read:    BACKUP_DIR, GIT_AUTHOR_NAME, GIT_AUTHOR_EMAIL, GITHUB_TOKEN
# Globals written: GIT_AUTHOR_*/GIT_COMMITTER_* are exported so that commits
#                  carry an identity even when the repository is cloned only
#                  after this function has run.
# Side effects:    creates BACKUP_DIR; when GITHUB_TOKEN is set, enables the
#                  global "store" credential helper and (over)writes
#                  ~/.git-credentials with owner-only permissions.
setup_git() {
  log "INFO" "Setting up git configuration..."
  mkdir -p "$BACKUP_DIR"

  local author_name="${GIT_AUTHOR_NAME:-picoclaw}"
  local author_email="${GIT_AUTHOR_EMAIL:-picoclaw@example.com}"

  # Repository-local identity can only be written once the repository exists.
  # On a first run there is no .git yet, and `git config` would fail and abort
  # the script under `set -e`.
  if [ -d "$BACKUP_DIR/.git" ]; then
    if ! { git --git-dir="$BACKUP_DIR/.git" config user.name "$author_name" \
      && git --git-dir="$BACKUP_DIR/.git" config user.email "$author_email"; }; then
      log "WARN" "Could not write repository-local git identity"
    fi
  fi

  # Environment identity works with or without an existing repository.
  export GIT_AUTHOR_NAME="$author_name"
  export GIT_AUTHOR_EMAIL="$author_email"
  export GIT_COMMITTER_NAME="${GIT_COMMITTER_NAME:-$author_name}"
  export GIT_COMMITTER_EMAIL="${GIT_COMMITTER_EMAIL:-$author_email}"

  # Configure credential helper if token exists
  if [ -n "${GITHUB_TOKEN:-}" ]; then
    git config --global credential.helper store
    # The file holds a secret: create it with a restrictive umask and pin the
    # mode in case the file already existed with wider permissions.
    # NOTE(review): the author name is reused as the URL user name; a name
    # containing spaces or URL-reserved characters would yield an invalid
    # entry — confirm that is acceptable.
    (
      umask 077
      printf 'https://%s:%s@github.com\n' "$author_name" "$GITHUB_TOKEN" \
        > ~/.git-credentials
    )
    chmod 600 ~/.git-credentials
  fi
}
# ============================================
# HEALTH CHECKS
# ============================================
# Verify that the filesystem holding BACKUP_DIR has at least MIN_DISK_FREE_MB
# megabytes available.
#
# Returns: 0 when enough space is free; 1 when space is low or when the free
#   space cannot be determined (e.g. BACKUP_DIR does not exist).
check_disk_space() {
  local free_kb free_mb

  # -P forces the one-line-per-filesystem POSIX layout and -k forces 1K
  # blocks, so column 4 of line 2 is reliably "available KiB".
  free_kb=$(df -Pk "$BACKUP_DIR" 2>/dev/null | awk 'NR==2 {print $4}') || free_kb=""

  # Treat missing/garbled output as a failed check instead of as "0 KiB".
  case "$free_kb" in
    '' | *[!0-9]*)
      log "WARN" "Unable to determine free disk space for $BACKUP_DIR"
      return 1
      ;;
  esac

  free_mb=$((free_kb / 1024))
  if [ "$free_mb" -lt "$MIN_DISK_FREE_MB" ]; then
    log "WARN" "Low disk space: ${free_mb}MB free (min: ${MIN_DISK_FREE_MB}MB)"
    return 1
  fi
  return 0
}
# Succeed (0) when WORKSPACE_DIR is an existing directory; otherwise log a
# warning and return 1.
check_workspace() {
  [ -d "$WORKSPACE_DIR" ] && return 0

  log "WARN" "Workspace directory not found: $WORKSPACE_DIR"
  return 1
}
# ============================================
# BACKUP & RESTORE WITH RETRY
# ============================================
# Helper: bring the existing repository in BACKUP_DIR onto DATASET_BRANCH and
# pull the latest commits. Returns 1 as soon as any git step fails so the
# caller can fall back to a fresh clone (no further git commands are run in a
# repository that is about to be discarded).
_initial_sync_update_repo() {
  local current_branch
  current_branch=$(git -C "$BACKUP_DIR" branch --show-current 2>/dev/null || echo "")

  if [ "$current_branch" != "$DATASET_BRANCH" ]; then
    log "INFO" "Switching from branch '$current_branch' to '$DATASET_BRANCH'..."
    if ! git -C "$BACKUP_DIR" fetch origin "$DATASET_BRANCH"; then
      log "WARN" "Fetch failed, will try fresh clone"
      return 1
    fi
    if ! git -C "$BACKUP_DIR" checkout "$DATASET_BRANCH"; then
      log "WARN" "Checkout failed, will try fresh clone"
      return 1
    fi
  fi

  # Pull latest from specified branch
  if ! git -C "$BACKUP_DIR" pull origin "$DATASET_BRANCH"; then
    log "WARN" "Pull failed, will re-clone..."
    return 1
  fi
  return 0
}

# Helper: (re)create the repository inside BACKUP_DIR.
# `git clone` refuses a non-empty destination, and BACKUP_DIR usually still
# holds workspace/, config.json and snapshots. The clone is therefore made in
# a sibling temp directory, its .git is moved into place, and the branch head
# is then checked out over the existing files. Untracked files are left alone.
_initial_sync_clone() {
  local clone_tmp

  mkdir -p "$BACKUP_DIR" || return 1
  clone_tmp=$(mktemp -d "${BACKUP_DIR%/}.clone.XXXXXX") || return 1

  if ! git clone --no-checkout --branch "$DATASET_BRANCH" "$DATASET_REPO" "$clone_tmp"; then
    rm -rf -- "$clone_tmp"
    return 1
  fi
  if ! mv -- "$clone_tmp/.git" "$BACKUP_DIR/.git"; then
    rm -rf -- "$clone_tmp"
    return 1
  fi
  rm -rf -- "$clone_tmp"

  if ! git -C "$BACKUP_DIR" reset --hard --quiet; then
    rm -rf -- "${BACKUP_DIR:?}/.git"
    return 1
  fi

  # A fresh clone has no repository-local identity yet; set one so later
  # commits do not depend on global git configuration.
  git -C "$BACKUP_DIR" config user.name "${GIT_AUTHOR_NAME:-picoclaw}" || true
  git -C "$BACKUP_DIR" config user.email "${GIT_AUTHOR_EMAIL:-picoclaw@example.com}" || true
  return 0
}

# Restore WORKSPACE_DIR and CONFIG_FILE from the dataset repository.
#
# Steps: take the state-file lock, prune expired backups, update the existing
# clone in BACKUP_DIR (or clone afresh), then copy workspace/ and config.json
# out of the backup.
#
# Returns: 0 on success; 1 when the repository could not be obtained. The
#   lock is released first, so the caller may simply call again to retry.
# Exits (via die) only when another live process holds the lock.
initial_sync() {
  log "INFO" "Starting initial sync (branch: $DATASET_BRANCH)..."

  # Prevent concurrent sync
  local pid=""
  if [ -f "$STATE_FILE" ]; then
    pid=$(cat "$STATE_FILE" 2>/dev/null || true)
    # A state file carrying our own PID is a leftover of an earlier attempt
    # by this very process, not a competing sync.
    if [ -n "$pid" ] && [ "$pid" != "$$" ] && kill -0 "$pid" 2>/dev/null; then
      log "WARN" "Another sync process (PID $pid) is running, waiting..."
      sleep 10
      if [ -f "$STATE_FILE" ] && kill -0 "$(cat "$STATE_FILE")" 2>/dev/null; then
        die "Another sync process still running, aborting."
      fi
    else
      rm -f "$STATE_FILE"
    fi
  fi
  echo $$ > "$STATE_FILE"

  # Cleanup old backups. Restricted to the top level of BACKUP_DIR so tracked
  # workspace files (or git internals) that happen to end in .bak are never
  # deleted; expired workspace snapshots are directories and need rm -rf.
  find "$BACKUP_DIR" -maxdepth 1 -type f -name "*.bak" \
    -mtime +"$BACKUP_RETENTION_DAYS" -delete 2>/dev/null || true
  find "$BACKUP_DIR" -maxdepth 1 -type d -name "workspace.bak_*" \
    -mtime +"$BACKUP_RETENTION_DAYS" -exec rm -rf -- {} + 2>/dev/null || true

  if [ -d "$BACKUP_DIR/.git" ]; then
    log "INFO" "Existing backup found, checking branch..."
    if ! _initial_sync_update_repo; then
      rm -rf -- "${BACKUP_DIR:?}/.git"
    fi
  fi

  if [ ! -d "$BACKUP_DIR/.git" ]; then
    log "INFO" "Cloning dataset repository (branch: $DATASET_BRANCH)..."
    if ! _initial_sync_clone; then
      # Report failure instead of exiting so the caller's retry loop can run.
      rm -f "$STATE_FILE"
      log "ERROR" "Failed to clone repository"
      return 1
    fi
  fi

  # Restore workspace
  if [ -d "$BACKUP_DIR/workspace" ]; then
    log "INFO" "Restoring workspace from backup..."
    mkdir -p "$WORKSPACE_DIR"
    if command -v rsync >/dev/null 2>&1; then
      rsync -av --delete --exclude='.git' "$BACKUP_DIR/workspace/" "$WORKSPACE_DIR/" || {
        log "WARN" "rsync failed, falling back to cp"
        cp -r "$BACKUP_DIR/workspace/." "$WORKSPACE_DIR/" 2>/dev/null || true
      }
    else
      cp -r "$BACKUP_DIR/workspace/." "$WORKSPACE_DIR/" 2>/dev/null || true
    fi
  fi

  if [ -f "$BACKUP_DIR/config.json" ]; then
    log "INFO" "Restoring config from backup..."
    cp "$BACKUP_DIR/config.json" "$CONFIG_FILE" 2>/dev/null || true
  fi

  rm -f "$STATE_FILE"
  log "INFO" "Initial sync completed (branch: $DATASET_BRANCH)."
  return 0
}
# ============================================
# SYNC NOW (WITH BRANCH SUPPORT)
# ============================================
# Copy the workspace into BACKUP_DIR and publish it on DATASET_BRANCH.
#
# Steps: health checks, timestamped safety snapshot of the workspace, mirror
# workspace/ and config.json into BACKUP_DIR, then commit and push.
#
# Returns: 0 when the cycle finished (including "workspace missing", "backup
#   is not a git repo" and "nothing to do"); 1 on low disk space, copy
#   failure, branch checkout failure, upstream-protection hit, or when the
#   push failed on every attempt.
sync_now() {
  log "INFO" "Starting sync operation..."
  check_workspace || return 0
  if ! check_disk_space; then
    log "ERROR" "Insufficient disk space, skipping sync"
    return 1
  fi

  # Backup current workspace before sync
  mkdir -p "$BACKUP_DIR"
  if [ -d "$WORKSPACE_DIR" ]; then
    local timestamp temp_backup
    timestamp=$(date '+%Y%m%d_%H%M%S')
    temp_backup="$BACKUP_DIR/workspace.bak_$timestamp"
    if command -v rsync >/dev/null 2>&1; then
      rsync -av --exclude='.git' "$WORKSPACE_DIR/" "$temp_backup/" 2>/dev/null || true
    else
      mkdir -p "$temp_backup"
      cp -r "$WORKSPACE_DIR/." "$temp_backup/" 2>/dev/null || true
    fi
    # Cleanup old backups. Snapshots are non-empty directories, which
    # `find -delete` cannot remove, so they are removed with rm -rf instead.
    find "$BACKUP_DIR" -maxdepth 1 -type d -name "workspace.bak_*" -mtime +1 \
      -exec rm -rf -- {} + 2>/dev/null || true
  fi

  # Sync to backup directory
  mkdir -p "$BACKUP_DIR/workspace"
  if command -v rsync >/dev/null 2>&1; then
    rsync -av --delete --exclude='.git' "$WORKSPACE_DIR/" "$BACKUP_DIR/workspace/" || {
      log "ERROR" "rsync failed"
      return 1
    }
  else
    rm -rf "$BACKUP_DIR/workspace"
    cp -r "$WORKSPACE_DIR" "$BACKUP_DIR/workspace" 2>/dev/null || {
      log "ERROR" "cp failed"
      return 1
    }
    # Parity with the rsync path, which never copies .git entries.
    find "$BACKUP_DIR/workspace" -name .git -prune -exec rm -rf -- {} + 2>/dev/null || true
  fi

  if [ -f "$CONFIG_FILE" ]; then
    cp "$CONFIG_FILE" "$BACKUP_DIR/config.json" 2>/dev/null || true
  fi

  # Git operations
  if [ ! -d "$BACKUP_DIR/.git" ]; then
    log "WARN" "Backup not a git repo, skipping commit"
    return 0
  fi

  # Local safety snapshots must never reach the dataset repository; otherwise
  # every cycle would commit a full extra copy of the workspace.
  local exclude_file="$BACKUP_DIR/.git/info/exclude"
  if ! grep -qxF -- '/workspace.bak_*' "$exclude_file" 2>/dev/null; then
    mkdir -p "$BACKUP_DIR/.git/info"
    printf '%s\n' '/workspace.bak_*' >> "$exclude_file"
  fi

  # Ensure we're on the correct branch
  local current_branch
  current_branch=$(git -C "$BACKUP_DIR" branch --show-current 2>/dev/null || echo "")
  if [ "$current_branch" != "$DATASET_BRANCH" ]; then
    log "INFO" "Switching to branch '$DATASET_BRANCH'..."
    git -C "$BACKUP_DIR" checkout "$DATASET_BRANCH" || {
      log "ERROR" "Failed to checkout branch $DATASET_BRANCH"
      return 1
    }
  fi

  # Security check: prevent pushing to upstream
  local remote_url
  remote_url=$(git -C "$BACKUP_DIR" remote get-url origin 2>/dev/null || echo "")
  if [[ "$remote_url" == *"sipeed/picoclaw"* ]]; then
    log "ERROR" "SECURITY: Attempted to push to upstream (sipeed/picoclaw), aborting!"
    return 1
  fi

  # Check for changes
  local need_push=0
  if [[ -n $(git -C "$BACKUP_DIR" status -s) ]]; then
    git -C "$BACKUP_DIR" add -A
    # Avoid empty commits: only commit when something is actually staged.
    if ! git -C "$BACKUP_DIR" diff --cached --quiet; then
      git -C "$BACKUP_DIR" commit -m "Auto-sync: $(date '+%Y-%m-%d %H:%M:%S')" || {
        log "WARN" "Commit failed (possibly empty)"
        return 0
      }
      need_push=1
    else
      log "INFO" "No changes to commit"
    fi
  else
    log "INFO" "No changes detected"
  fi

  # A commit whose push failed in an earlier cycle must still be delivered:
  # push whenever HEAD differs from the last known state of the remote branch.
  if (( need_push == 0 )); then
    local head_rev remote_rev
    head_rev=$(git -C "$BACKUP_DIR" rev-parse --verify --quiet HEAD) || head_rev=""
    remote_rev=$(git -C "$BACKUP_DIR" rev-parse --verify --quiet \
      "refs/remotes/origin/$DATASET_BRANCH") || remote_rev=""
    if [ -n "$head_rev" ] && [ "$head_rev" != "$remote_rev" ]; then
      log "INFO" "Unpushed commits found on branch '$DATASET_BRANCH', pushing..."
      need_push=1
    fi
  fi
  (( need_push == 1 )) || return 0

  # Push to specific branch. Success is tracked explicitly, so a push that
  # succeeds on the final attempt is not mistaken for a failure.
  local attempt pushed=0
  for ((attempt = 1; attempt <= MAX_RETRIES; attempt++)); do
    if git -C "$BACKUP_DIR" push origin "$DATASET_BRANCH"; then
      log "INFO" "Sync completed and pushed to branch '$DATASET_BRANCH' successfully"
      pushed=1
      break
    fi
    log "WARN" "Push to branch '$DATASET_BRANCH' failed (attempt $attempt/$MAX_RETRIES), retrying..."
    if (( attempt < MAX_RETRIES )); then
      sleep $((attempt * 5))
    fi
  done
  if (( pushed == 0 )); then
    log "ERROR" "Push to branch '$DATASET_BRANCH' failed after $MAX_RETRIES attempts"
    return 1
  fi
  return 0
}
# ============================================
# MAIN LOOP
# ============================================
# Helper: sleep for $1 seconds without blocking signal handling. Bash defers
# traps until a foreground command finishes; `wait` on a background sleep
# returns as soon as a trapped signal arrives, so shutdown is not delayed by
# a full sleep interval.
_sleep_interruptible() {
  sleep "$1" &
  wait "$!" || true
}

# Daemon entry point: log the effective settings, prepare git, run the
# initial restore (up to MAX_RETRIES attempts) and then sync forever.
#
# Exits with status 1 (via die) when the initial sync never succeeds;
# otherwise it only ends through the signal handler.
main() {
  # The log file lives in PICOCLAW_HOME; make sure it exists before logging.
  mkdir -p "$PICOCLAW_HOME"

  log "INFO" "=== PicoClaw Dataset Sync Daemon v2.1 (Branch Support) ==="
  log "INFO" "Repository: $DATASET_REPO"
  log "INFO" "Branch: $DATASET_BRANCH"
  log "INFO" "Backup dir: $BACKUP_DIR"
  log "INFO" "Workspace: $WORKSPACE_DIR"
  log "INFO" "Sync interval: ${SYNC_INTERVAL}s"
  setup_git

  # Initial sync with retry. Success is recorded in a flag; comparing the
  # loop counter with MAX_RETRIES would misreport a success on the final
  # attempt (or any success when MAX_RETRIES=1) as a failure.
  local attempt synced=0
  for ((attempt = 1; attempt <= MAX_RETRIES; attempt++)); do
    if initial_sync; then
      synced=1
      break
    fi
    if (( attempt < MAX_RETRIES )); then
      log "WARN" "Initial sync failed (attempt $attempt/$MAX_RETRIES), retrying in 10s..."
      _sleep_interruptible 10
    else
      log "WARN" "Initial sync failed (attempt $attempt/$MAX_RETRIES)"
    fi
  done
  if (( synced == 0 )); then
    die "Initial sync failed after $MAX_RETRIES attempts"
  fi

  # Main sync loop
  while true; do
    if sync_now; then
      log "INFO" "Sync cycle completed, sleeping ${SYNC_INTERVAL}s"
      _sleep_interruptible "$SYNC_INTERVAL"
    else
      # Retry after the announced 30s rather than 30s plus a full interval.
      log "WARN" "Sync cycle failed, retrying in 30s..."
      _sleep_interruptible 30
    fi
  done
}
# Script entry point: start the daemon, forwarding all command-line arguments.
main "$@"