#!/usr/bin/env python3
from __future__ import annotations

import os, shutil, socket, sys, tempfile, time
from pathlib import Path

# Hugging Face credentials and target namespace, read once at import time.
# HF_USERNAME falls back to SPACE_AUTHOR_NAME; when both are empty, repo_id()
# resolves the namespace through api.whoami() instead.
HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
HF_USERNAME = os.environ.get("HF_USERNAME", "").strip() or os.environ.get("SPACE_AUTHOR_NAME", "").strip()
# Dataset that receives DevData snapshots; an empty/unset variable selects
# "huggingclaw-devdata".
DATASET_NAME = os.environ.get("DEVDATA_DATASET_NAME", "").strip() or "huggingclaw-devdata"
# Name of the backup dataset (BACKUP_DATASET is accepted as a fallback variable
# name). enabled() refuses to run when this equals DATASET_NAME.
BACKUP_DATASET_NAME = os.environ.get("BACKUP_DATASET_NAME", "").strip() or os.environ.get("BACKUP_DATASET", "").strip() or "huggingclaw-backup"
# Local tree that is snapshotted by sync_loop() and written to by restore_once().
JUPYTER_ROOT = Path(os.environ.get("JUPYTER_ROOT_DIR", "/home/node")).resolve()
# Seconds slept between sync cycles (default 180). A non-integer value raises
# ValueError at import time.
INTERVAL = int((os.environ.get("DEVDATA_SYNC_INTERVAL", "").strip() or "180"))
# BUG FIX #5: Respect max file size so giant files don't stall uploads.
# Matches the 50 MB ceiling in openclaw-sync.py; override with DEVDATA_MAX_FILE_BYTES.
# snapshot() leaves files larger than this out of the upload and reports them
# as protected so they are not pruned from the dataset.
MAX_FILE_SIZE_BYTES = int(
    (os.environ.get("DEVDATA_MAX_FILE_BYTES", "").strip() or str(50 * 1024 * 1024))
)
# Max stale files to delete per commit.  Mirrors openclaw-sync.py behaviour.
# Override via DEVDATA_PRUNE_BATCH_SIZE.
PRUNE_BATCH_SIZE = int((os.environ.get("DEVDATA_PRUNE_BATCH_SIZE", "").strip() or "50"))
# Reserved filenames previously generated by DevData. Never copy or restore
# them as user content, and let prune delete any old remote marker files.
# should_skip() compares these against the final path component only.
RESERVED_SYNC_FILENAMES = {".huggingclaw-empty-dir"}

def is_true(value):
    """Return True when *value*, once stringified, is an accepted "on" flag.

    The comparison ignores surrounding whitespace and letter case; the
    accepted spellings are 1, true, yes and on. Everything else is False.
    """
    normalized = str(value).strip().lower()
    return normalized in ("1", "true", "yes", "on")

# Master switch read from DEVDATA. An unset variable defaults to "on"; any
# value outside is_true()'s accepted set (including an empty string) turns
# DevData sync off.
ENABLE = is_true(os.environ.get("DEVDATA", "on"))


def classify_error(exc: Exception) -> str:
    """Map *exc* to a coarse category label used in DevData log lines.

    Categories are checked in this order, first match wins:
    "filesystem-permission" (a PermissionError, or "permission denied" in the
    message), "network-provider" (a network-related phrase in the message),
    "safety-scan" ("unsafe", "malware" or "security" in the message), and
    finally "general". Message matching is case-insensitive.
    """
    text = str(exc).lower()

    if isinstance(exc, PermissionError) or "permission denied" in text:
        return "filesystem-permission"

    network_markers = (
        "connection error",
        "fetch failed",
        "timeout",
        "temporarily unavailable",
        "network",
    )
    for marker in network_markers:
        if marker in text:
            return "network-provider"

    for marker in ("unsafe", "malware", "security"):
        if marker in text:
            return "safety-scan"

    return "general"

# Names that are always unsafe/noisy to persist. should_skip() rejects a path
# when ANY of its components (directory or file name) equals one of these, at
# any depth. Keep this list to single path components; path-specific
# exclusions live in the prefix tables below.
EXCLUDE = {
    ".cache",
    "node_modules",
    ".npm",
    ".yarn",
    "Trash",            # matches .local/share/Trash and any other component named exactly "Trash"
    ".ipynb_checkpoints",
    ".openclaw",
    "site-packages",
    "__pycache__",
}

# Path prefixes, relative to the synced root, that should not be synced. Each
# entry is a tuple of leading path components; a path is skipped when it equals
# the prefix or lies beneath it. These entries are all single top-level names.
ROOT_SKIP_PATH_PREFIXES = {
    ("app",),
    ("HuggingClaw",),
    ("HuggingClaw-Workspace",),
    ("browser-deps",),
}

# Further skipped prefixes, matched by should_skip() the same way as
# ROOT_SKIP_PATH_PREFIXES (from the first path component). This keeps
# package/runtime trees out of the dataset without blocking real JupyterLab
# settings under .local/share/jupyter/lab/user-settings.
SKIP_PATH_PREFIXES = {
    (".jupyter", "runtime"),
    (".local", "bin"),
    (".local", "lib"),
    ("lib",),
}

# Everything under JUPYTER_DATA_DIR_PREFIX is skipped unless it falls under one
# of JUPYTER_DATA_ALLOW_PREFIXES. iter_sync_tree() still descends through the
# ancestors of the allowed prefixes so those subtrees can be reached.
JUPYTER_DATA_DIR_PREFIX = (".local", "share", "jupyter")
JUPYTER_DATA_ALLOW_PREFIXES = {
    (".local", "share", "jupyter", "lab", "user-settings"),
    (".local", "share", "jupyter", "lab", "workspaces"),
}


def enabled():
    """Decide whether DevData sync should run in this process.

    Sync requires all of: the DEVDATA master switch (ENABLE), dev mode, a
    non-empty HF_TOKEN, and a DevData dataset name that differs from the backup
    dataset name. Dev mode comes from HUGGINGCLAW_JUPYTER_ENABLED when that
    variable is non-blank, otherwise from DEV_MODE. When only the
    dataset-separation requirement fails, an explanatory line is printed.
    """
    override = os.environ.get("HUGGINGCLAW_JUPYTER_ENABLED", "")
    dev_flag = override if override.strip() else os.environ.get("DEV_MODE", "")
    dev = is_true(dev_flag)

    prerequisites_met = ENABLE and dev and bool(HF_TOKEN)
    datasets_distinct = DATASET_NAME != BACKUP_DATASET_NAME

    if prerequisites_met and not datasets_distinct:
        print("DevData sync disabled: DEVDATA_DATASET_NAME must be separate from BACKUP_DATASET_NAME.")
    return prerequisites_met and datasets_distinct

def validate_jupyter_paths() -> None:
    """Warn about Jupyter directories that cannot be created or written.

    For JUPYTER_ROOT, /home/node/.jupyter and /home/node/.local/share/jupyter
    the directory is created if missing and a small probe file is written and
    removed. Any failure is printed as a warning and never raised, because
    unwritable settings directories make Jupyter settings appear to reset on
    every restart.
    """
    locations = (
        JUPYTER_ROOT,
        Path("/home/node/.jupyter"),
        Path("/home/node/.local/share/jupyter"),
    )
    for required in locations:
        try:
            required.mkdir(parents=True, exist_ok=True)
            marker = required / ".devdata-write-check"
            marker.write_text("ok", encoding="utf-8")
            marker.unlink(missing_ok=True)
        except Exception as exc:
            kind = classify_error(exc)
            print(f"DevData warning [{kind}]: {required} is not writable; Jupyter settings may not persist ({exc})")

def repo_id(api) -> str:
    """Build the "<namespace>/<dataset>" id of the DevData dataset.

    The namespace is HF_USERNAME when set; otherwise it is looked up through
    ``api.whoami()``, preferring the "name" field over "user".

    Raises:
        RuntimeError: when no namespace can be determined.
    """
    ns = HF_USERNAME
    if not ns:
        identity = api.whoami()
        ns = identity.get("name") or identity.get("user") or ""
    if ns:
        return f"{ns}/{DATASET_NAME}"
    raise RuntimeError("Cannot resolve HF namespace for devdata sync")

# Filename patterns that must never be synced to a public/private HF Dataset.
# These are matched against the *name* of each path component (not the full path),
# so ".env" matches /home/node/.env and /home/node/subdir/.env alike.
import fnmatch as _fnmatch

# fnmatch-style patterns. _name_is_secret() lowercases the candidate name before
# matching, so every pattern here must be written in lowercase.
SECRET_FILENAME_PATTERNS = {
    ".env",           # dotenv files — almost always contain API keys
    ".env.*",         # .env.local, .env.production, etc.
    # SSH key material and host/key lists
    "id_rsa",
    "id_dsa",
    "id_ecdsa",
    "id_ed25519",
    "authorized_keys",
    "known_hosts",
    # Names built around the word "secret" (bare, as a prefix, or as a suffix)
    "secret",
    "secrets",
    "secret.*",
    "*.secret",
    "*_secret",
    "*_secret.*",
    "*-secret",
    "*-secret.*",
    # Names built around the word "token" (bare, as a prefix, or as a suffix)
    "token",
    "token.*",
    "*.token",
    "*_token",
    "*_token.*",
    "*-token",
    "*-token.*",
    "api_token",
    "access_token",
    "refresh_token",
    "credentials",    # common credential file names
    "credentials.*",
    # Auth configuration files in common formats
    "auth.json",
    "auth.yaml",
    "auth.yml",
    "auth.toml",
    "auth.ini",
    "*.pem",          # TLS/SSH private keys
    "*.key",          # generic key files
    "*.p12",          # PKCS#12 bundles
    "*.pfx",
    ".netrc",         # stores plaintext passwords
    ".htpasswd",
}


def _name_is_secret(name: str) -> bool:
    """Tell whether *name*, lowercased, matches a secret-exclusion pattern."""
    candidate = name.lower()
    for pattern in SECRET_FILENAME_PATTERNS:
        if _fnmatch.fnmatch(candidate, pattern):
            return True
    return False


def _matches_prefix(parts: tuple[str, ...], prefix: tuple[str, ...]) -> bool:
    return len(parts) >= len(prefix) and parts[:len(prefix)] == prefix


def _is_prefix_of(parts: tuple[str, ...], full_path: tuple[str, ...]) -> bool:
    return len(parts) <= len(full_path) and full_path[:len(parts)] == parts


def should_skip(p: Path):
    """Return True when relative path *p* must not be synced or restored.

    A path is rejected when any of the following holds, checked in order:
    its final component is a reserved DevData marker filename; any component
    is in EXCLUDE; it equals or lies under a ROOT_SKIP_PATH_PREFIXES entry; it
    lies under the Jupyter data directory but outside the allowed subtrees; it
    equals or lies under a SKIP_PATH_PREFIXES entry; or any component looks
    like a secret according to _name_is_secret().
    """
    # Old datasets may still carry marker files; they are never user data.
    if p.name in RESERVED_SYNC_FILENAMES:
        return True

    parts = p.parts

    # Any single component from the hard-coded exclude set disqualifies the path.
    if not EXCLUDE.isdisjoint(parts):
        return True

    for prefix in ROOT_SKIP_PATH_PREFIXES:
        if _matches_prefix(parts, prefix):
            return True

    # Inside the Jupyter data dir only the allow-listed subtrees survive.
    if _matches_prefix(parts, JUPYTER_DATA_DIR_PREFIX):
        in_allowed_subtree = any(
            _matches_prefix(parts, allowed) for allowed in JUPYTER_DATA_ALLOW_PREFIXES
        )
        if not in_allowed_subtree:
            return True

    for prefix in SKIP_PATH_PREFIXES:
        if _matches_prefix(parts, prefix):
            return True

    # Finally, refuse anything with a secret-looking component name.
    return any(_name_is_secret(component) for component in parts)


def iter_sync_tree(root: Path):
    """Yield the directories and files under *root* that DevData may sync.

    The tree is walked top-down with os.walk. Excluded or symlinked
    directories are removed from the walk so their contents are never
    visited. For each visited directory the kept sub-directories are yielded
    first (sorted by name), then the kept files (sorted by name). Symlinked
    files are never yielded. Nothing is yielded when *root* does not exist.
    """
    if not root.exists():
        return

    def _leads_to_allowed(parts):
        # should_skip() rejects .local/share/jupyter and its lab/ child by
        # design, yet the walk has to pass through them to reach the
        # allow-listed lab/user-settings and lab/workspaces subtrees.
        return any(_is_prefix_of(parts, allowed) for allowed in JUPYTER_DATA_ALLOW_PREFIXES)

    for current, subdirs, files in os.walk(root):
        here = Path(current)
        try:
            here_rel = here.relative_to(root)
        except ValueError:
            here_rel = Path()

        descend: list[str] = []
        for entry in sorted(subdirs):
            if (here / entry).is_symlink():
                continue
            entry_rel = here_rel / entry
            if should_skip(entry_rel) and not _leads_to_allowed(entry_rel.parts):
                continue
            descend.append(entry)
        # In-place assignment is what tells os.walk which children to visit.
        subdirs[:] = descend

        yield from (here / entry for entry in descend)

        for entry in sorted(files):
            candidate = here / entry
            if candidate.is_symlink() or should_skip(here_rel / entry):
                continue
            yield candidate


def snapshot(src: Path, dst: Path) -> tuple[bool, set[str]]:
    """Copy the syncable part of *src* into the staging directory *dst*.

    Args:
        src: Root of the live tree; entries come from iter_sync_tree(src).
        dst: Existing staging directory that will later be uploaded.

    Returns:
        A ``(had_copy_failures, protected_paths)`` tuple. ``had_copy_failures``
        is True when at least one file could not be stat'ed or copied.
        ``protected_paths`` holds the POSIX-style relative paths of files that
        were deliberately left out of *dst* (larger than MAX_FILE_SIZE_BYTES)
        or could not be copied, so the caller can keep prune from deleting
        their remote copies.
    """
    had_copy_failures = False
    protected_large_files: set[str] = set()
    for p in iter_sync_tree(src):
        rel = p.relative_to(src)
        target = dst / rel
        if p.is_dir():
            # Keep parent directories for files copied later in this snapshot.
            # Empty folders are intentionally not represented in the git-backed
            # HF Dataset; once a folder contains syncable files, those files
            # carry the folder path naturally.
            target.mkdir(parents=True, exist_ok=True)
            continue
        if not p.is_file():
            continue

        rel_posix = rel.as_posix()
        # BUG FIX #5: Skip files that exceed the size limit.
        try:
            too_large = p.stat().st_size > MAX_FILE_SIZE_BYTES
        except OSError:
            had_copy_failures = True
            protected_large_files.add(rel_posix)
            continue
        if too_large:
            protected_large_files.add(rel_posix)
            continue

        target.parent.mkdir(parents=True, exist_ok=True)
        try:
            shutil.copy2(p, target)
        except OSError:
            had_copy_failures = True
            protected_large_files.add(rel_posix)
            # copy2 can fail after part of the destination has been written
            # (read error, staging disk full). Remove the fragment so a
            # truncated file is never uploaded over the intact remote copy;
            # the path stays protected and is retried on the next cycle.
            try:
                target.unlink(missing_ok=True)
            except OSError:
                # Best effort: the staging directory is discarded by the caller.
                pass
    return had_copy_failures, protected_large_files

def is_jupyter_running(port: int = 8888) -> bool:
    """Report whether something accepts TCP connections on 127.0.0.1:*port*.

    A successful connect within two seconds is taken to mean JupyterLab is
    listening there; any OSError (refused, timed out, ...) means it is not.

    The probe exists as a safety aid around restores: copying files into
    JUPYTER_ROOT while JupyterLab is live can disturb its runtime and settings
    files. Callers decide what to do with the answer; this function only
    performs the connection test.
    """
    address = ("127.0.0.1", port)
    try:
        conn = socket.create_connection(address, timeout=2)
        conn.close()
    except OSError:
        return False
    return True

def restore_once(api, rid: str):
    """Download dataset *rid* and copy its syncable content into JUPYTER_ROOT.

    The dataset is fetched into a temporary directory, then every entry that is
    not ``.gitattributes`` and not rejected by should_skip() is recreated under
    JUPYTER_ROOT. The temporary directory is always removed afterwards.

    Args:
        api: Accepted for signature compatibility; not used by this function.
        rid: Dataset repo id of the form "<namespace>/<name>".

    Errors never propagate. A missing dataset and any download failure are
    printed as a single message. A directory or file that cannot be written is
    reported and skipped on its own, so one unwritable location does not stop
    the remaining entries from being restored (which would otherwise leave
    them missing locally when the next sync prunes the dataset).
    """
    from huggingface_hub import snapshot_download
    from huggingface_hub.errors import RepositoryNotFoundError
    tmp = Path(tempfile.mkdtemp(prefix="devdata-restore-"))
    try:
        snapshot_download(repo_id=rid, repo_type="dataset", local_dir=str(tmp), local_dir_use_symlinks=False, token=HF_TOKEN)
        for p in tmp.rglob("*"):
            rel = p.relative_to(tmp)
            if str(rel) == ".gitattributes" or should_skip(rel):
                continue
            target = JUPYTER_ROOT / rel
            try:
                if p.is_dir():
                    target.mkdir(parents=True, exist_ok=True)
                elif p.is_file():
                    target.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(p, target)
            except OSError as exc:
                # Covers both directory creation and the copy itself.
                kind = classify_error(exc)
                print(f"DevData restore skip [{kind}] (cannot write {target}): {exc}")
        print(f"DevData restored from {rid}")
    except RepositoryNotFoundError:
        print(f"DevData dataset not found yet: {rid}")
    except Exception as exc:
        kind = classify_error(exc)
        print(f"DevData restore warning [{kind}]: {exc}")
    finally:
        shutil.rmtree(tmp, ignore_errors=True)

def prune_remote_deleted_files(
    api,
    rid: str,
    snapshot_dir: Path,
    skip_prefixes: set[str] | None = None,
    protected_paths: set[str] | None = None,
) -> None:
    """Delete from the HF dataset any files that are absent from the snapshot.

    Without this, deleted files re-appear on the next Space restart because
    restore_once() copies everything in the dataset back to disk.

    Deletions are sent through ``api.create_commit`` with
    CommitOperationDelete operations, in chunks of at most PRUNE_BATCH_SIZE
    paths per commit (at least one path per commit, even if the configured
    size is zero or negative).

    Args:
        api: Client exposing ``list_repo_files`` and ``create_commit``.
        rid: Dataset repo id.
        snapshot_dir: Staging directory holding the files that were just
            uploaded; any remote file not present here is a deletion candidate.
        skip_prefixes: Remote paths equal to, or located under, one of these
            POSIX-style prefixes are never deleted.
        protected_paths: Exact remote paths that are never deleted.

    ``.gitattributes`` is always kept. The function is best-effort: every
    exception is classified and printed, never raised.
    """
    try:
        # Imported locally, as restore_once() does, so this function does not
        # rely on a name that is only bound when the file runs as a script.
        from huggingface_hub import CommitOperationDelete

        skip_prefixes = skip_prefixes or set()
        protected_paths = protected_paths or set()
        local_files = {
            p.relative_to(snapshot_dir).as_posix()
            for p in snapshot_dir.rglob("*")
            if p.is_file()
        }
        remote_files = list(api.list_repo_files(repo_id=rid, repo_type="dataset"))
        stale = [
            f for f in remote_files
            if f not in local_files
            and f != ".gitattributes"
            and f not in protected_paths
            and not any(f == prefix or f.startswith(prefix + "/") for prefix in skip_prefixes)
        ]
        if not stale:
            return

        total = len(stale)
        # A zero batch size would divide by zero and a negative one would
        # produce no batches while still reporting success; clamp to 1.
        batch_size = max(1, PRUNE_BATCH_SIZE)
        batches = [stale[start:start + batch_size] for start in range(0, total, batch_size)]
        num_batches = len(batches)
        ts = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
        for batch_idx, batch in enumerate(batches):
            batch_label = f" (batch {batch_idx + 1}/{num_batches})" if num_batches > 1 else ""
            operations = [CommitOperationDelete(path_in_repo=p) for p in batch]
            api.create_commit(
                repo_id=rid,
                repo_type="dataset",
                operations=operations,
                commit_message=f"DevData prune {len(batch)} deleted file(s) {ts}{batch_label}",
            )
        print(f"DevData pruned {total} deleted file(s) from {rid}"
              + (f" in {num_batches} batches" if num_batches > 1 else ""))
    except Exception as exc:
        kind = classify_error(exc)
        print(f"DevData prune warning [{kind}]: {exc}")

def sync_loop(api, rid: str):
    """Run the snapshot -> upload -> prune cycle forever.

    Each cycle stages JUPYTER_ROOT into a fresh temporary directory, uploads
    that directory to dataset *rid*, then removes remote files that no longer
    exist locally. Failures inside a cycle are classified and printed; the
    staging directory is always removed and the loop continues after sleeping
    INTERVAL seconds.

    Args:
        api: Client passed through to prune_remote_deleted_files().
        rid: Dataset repo id.

    This function does not return.
    """
    # Imported locally, as restore_once() does, so the loop does not rely on a
    # name that is only bound when the file runs as a script.
    from huggingface_hub import upload_folder

    # time.sleep() rejects negative values; the call sits outside the try
    # block, so an unclamped negative interval would terminate the loop.
    pause = max(INTERVAL, 0)
    while True:
        tmp = Path(tempfile.mkdtemp(prefix="devdata-snap-"))
        try:
            had_copy_failures, protected_large_files = snapshot(JUPYTER_ROOT, tmp)
            upload_folder(
                folder_path=str(tmp),
                repo_id=rid,
                repo_type="dataset",
                token=HF_TOKEN,
                commit_message=f"DevData sync {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}",
            )
            print(f"DevData synced to {rid}")
            # BUG FIX #6: Prune files deleted locally so they don't reappear on restore.
            skip_prune_prefixes: set[str] = set()
            if had_copy_failures:
                # Snapshot copy races can produce a partial view; avoid pruning
                # runtime-heavy Jupyter paths in that case.
                skip_prune_prefixes.update({"runtime", ".local/share/jupyter/runtime"})
                print("DevData snapshot had copy failures; pruning stale files with runtime-path safeguards.")
            prune_remote_deleted_files(
                api,
                rid,
                tmp,
                skip_prefixes=skip_prune_prefixes,
                protected_paths=protected_large_files,
            )
        except Exception as exc:
            kind = classify_error(exc)
            print(f"DevData sync warning [{kind}]: {exc}")
        finally:
            shutil.rmtree(tmp, ignore_errors=True)
        time.sleep(pause)


if __name__ == "__main__":
    if not enabled():
        print("DevData sync disabled.")
        sys.exit(0)

    # These names are bound at module level on purpose: functions defined
    # above look some of them up as globals when the file runs as a script.
    from huggingface_hub import CommitOperationDelete, HfApi, upload_folder, snapshot_download
    from huggingface_hub.errors import RepositoryNotFoundError

    api = HfApi(token=HF_TOKEN)
    rid = repo_id(api)
    # Make sure the private DevData dataset exists before either mode runs.
    try:
        api.repo_info(repo_id=rid, repo_type="dataset")
    except RepositoryNotFoundError:
        api.create_repo(repo_id=rid, repo_type="dataset", private=True)

    # Both modes begin with the same writability check.
    validate_jupyter_paths()

    # ── Two invocation modes (BUG FIX #3) ─────────────────────────────────
    # `--restore`: synchronous one-shot restore, then exit. It is meant to be
    # invoked by the launcher BEFORE JupyterLab starts, because copying files
    # into JUPYTER_ROOT under a live JupyterLab can break it. Note that this
    # branch does not itself probe for a running JupyterLab; it relies on the
    # launcher's ordering.
    #
    # No flag: background mode. It never restores and goes straight to the
    # upload loop, so it is safe to start while JupyterLab is running.
    if "--restore" in sys.argv:
        restore_once(api, rid)
        sys.exit(0)

    # BUG FIX #2 (safety net): the TCP probe is only used here to pick the
    # startup log line. When JupyterLab is not detected the script still skips
    # the restore, so it cannot race a JupyterLab that is just starting up.
    if is_jupyter_running():
        startup_note = "DevData: background sync started (JupyterLab is live, restore already done by --restore)."
    else:
        startup_note = "DevData: WARNING — JupyterLab not detected on port 8888. Skipping restore to be safe; starting sync loop."
    print(startup_note)

    sync_loop(api, rid)