#!/usr/bin/env python3
"""Her · हेर — bulk session uploader (scan → scrub → upload, with your approval).

Brings your Claude Code sessions into the private Her Space so you get a full Projects
view. It NEVER touches your originals: it COPIES the sessions you pick into a local
staging folder, SCRUBS likely secrets from the copies, then UPLOADS them — pausing for
your approval at each of the three steps.

Pure standard library — no pip installs. Run:

    python her_upload.py
    python her_upload.py --space build-small-hackathon/her        # override the Space
    python her_upload.py --projects-dir ~/.claude/projects     # override the source

Auth: uses your Hugging Face token (HF_TOKEN or HUGGING_FACE_HUB_TOKEN env, else the
token file ~/.cache/huggingface/token — created by `hf auth login` — or
~/.huggingface/token). Required because the Space is private.

PRIVACY: the scrubber is best-effort (you review the redaction summary before upload),
and your uploads auto-delete from the Space after 24h (or when you click "clear my data"
/ close the tab). Nothing here ever modifies ~/.claude.
"""
from __future__ import annotations

import argparse
import glob
import json
import os
import re
import shutil
import sys
import uuid
import urllib.request
import urllib.error
from pathlib import Path

DEFAULT_SPACE = "build-small-hackathon/her"


# --------------------------------------------------------------------------- #
# small console helpers
# --------------------------------------------------------------------------- #
def c(txt, color="orange"):
    """Wrap *txt* in an ANSI SGR escape sequence for the named colour/style.

    Known names are "orange", "red", "green", "cyan", "dim" and "bold"; any
    other name falls back to SGR code 0. The result always ends with a reset.
    """
    sgr = {
        "orange": "38;5;208",
        "red": "31",
        "green": "32",
        "cyan": "36",
        "dim": "2",
        "bold": "1",
    }.get(color, "0")
    opener = f"\033[{sgr}m"
    return f"{opener}{txt}\033[0m"


def hr():
    """Print a dim, 64-character horizontal rule to separate console sections."""
    rule = "─" * 64
    print(c(rule, "dim"))


def ask(prompt: str) -> str:
    """Show *prompt*, read one line from stdin and return it stripped.

    On Ctrl-C or end-of-input, prints "aborted." and exits with status 1.
    """
    try:
        reply = input(prompt)
    except (EOFError, KeyboardInterrupt):
        print("\naborted.")
        sys.exit(1)
    return reply.strip()


def confirm(prompt: str) -> bool:
    """Ask a yes/no question; only "y" or "yes" (any case) counts as consent."""
    answer = ask(f"{prompt} [y/N] ").lower()
    return answer == "y" or answer == "yes"


# --------------------------------------------------------------------------- #
# auth + host
# --------------------------------------------------------------------------- #
def hf_token() -> str:
    """Return the Hugging Face access token, or exit with status 1 if none is found.

    Lookup order:
      1. the ``HF_TOKEN`` then ``HUGGING_FACE_HUB_TOKEN`` environment variables
         (a blank or whitespace-only value is ignored, not returned as "");
      2. token files, first non-empty one wins: ``$HF_TOKEN_PATH``,
         ``$HF_HOME/token``, ``$XDG_CACHE_HOME/huggingface/token``, then the
         fixed defaults ``~/.cache/huggingface/token`` and ``~/.huggingface/token``.
    """
    for var in ("HF_TOKEN", "HUGGING_FACE_HUB_TOKEN"):
        tok = os.environ.get(var, "").strip()
        if tok:
            return tok

    # Honour the same relocation variables the Hugging Face tooling uses, then
    # fall back to the historical hard-coded locations.
    candidates = []
    token_path = os.environ.get("HF_TOKEN_PATH")
    if token_path:
        candidates.append(Path(token_path).expanduser())
    hf_home = os.environ.get("HF_HOME")
    if hf_home:
        candidates.append(Path(hf_home).expanduser() / "token")
    xdg_cache = os.environ.get("XDG_CACHE_HOME")
    if xdg_cache:
        candidates.append(Path(xdg_cache).expanduser() / "huggingface" / "token")
    candidates.append(Path.home() / ".cache/huggingface/token")
    candidates.append(Path.home() / ".huggingface/token")

    for p in candidates:
        try:
            t = p.read_text(encoding="utf-8").strip()
        except (OSError, UnicodeError):
            # missing / unreadable / not valid UTF-8: try the next location
            continue
        if t:
            return t

    print(c("No Hugging Face token found.", "red"))
    print("Run `hf auth login` (or set HF_TOKEN) so the script can reach your private Space.")
    sys.exit(1)


def space_host(space_id: str) -> str:
    """Map a Space id ``owner/name`` to its direct host ``owner-name.hf.space``.

    The id is lowercased and each slash becomes a dash.
    """
    # NOTE(review): other characters (e.g. "_" or ".") are passed through
    # unchanged — confirm against the Space's real subdomain if the id has any.
    subdomain = "-".join(space_id.split("/")).lower()
    return f"{subdomain}.hf.space"


# --------------------------------------------------------------------------- #
# scan projects (read the REAL cwd from inside each file — like the engine does)
# --------------------------------------------------------------------------- #
def read_cwd(path: str):
    """Return the working directory recorded inside a session ``.jsonl`` file.

    Reads the file line by line and returns the ``cwd`` value of the first JSON
    object whose ``type`` is ``"user"`` or ``"assistant"`` and whose ``cwd`` is
    truthy. Blank lines and lines that are not valid JSON are skipped.

    Returns None when no such record exists or the file cannot be opened/read.
    """
    try:
        # errors="replace": a stray non-UTF-8 byte must not abort the scan with
        # UnicodeDecodeError (which `except OSError` below would not catch).
        with open(path, "r", encoding="utf-8", errors="replace") as fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                try:
                    record = json.loads(line)
                except ValueError:
                    continue
                if not isinstance(record, dict):
                    continue
                if record.get("type") in ("user", "assistant") and record.get("cwd"):
                    return record["cwd"]
    except OSError:
        return None
    return None


def scan(projects_dir: str):
    """Return [{encoded, cwd, files:[paths]}] grouped by the encoded project folder.

    Looks for ``<projects_dir>/<encoded>/*.jsonl``. For each group:
      * ``encoded`` is the folder name,
      * ``files`` is the sorted list of absolute session paths,
      * ``cwd`` is the first non-empty value ``read_cwd`` yields over those
        files (None if no file records one).

    Groups are sorted case-insensitively by ``cwd``, falling back to ``encoded``.
    """
    groups = {}
    # Escape the directory itself so characters like "[" in the user's path are
    # taken literally; only the two trailing components are wildcards.
    pattern = os.path.join(glob.escape(projects_dir), "*", "*.jsonl")
    for fp in glob.glob(pattern):
        enc = os.path.basename(os.path.dirname(fp))
        entry = groups.setdefault(enc, {"encoded": enc, "cwd": None, "files": []})
        entry["files"].append(os.path.abspath(fp))
    for g in groups.values():
        g["files"].sort()
        # lazy: stops reading files as soon as one yields a cwd
        g["cwd"] = next((cwd for cwd in map(read_cwd, g["files"]) if cwd), None)
    # str(): cwd comes from JSON and is not guaranteed to be a string
    return sorted(groups.values(), key=lambda g: str(g["cwd"] or g["encoded"]).lower())


def parse_selection(sel: str, n: int):
    """Turn a user selection string into a sorted list of 0-based indices.

    Accepts ``all`` / ``*`` / ``a`` (everything), or a list of 1-based numbers
    and inclusive ranges separated by commas and/or whitespace, e.g.
    ``1,3,5``, ``1 3 5`` or ``2-6`` (spaces around the dash are allowed).
    Numbers outside ``1..n`` and tokens that do not parse are ignored.

    Returns the selected indices (0-based, de-duplicated, ascending); the list
    is empty when nothing valid was selected.
    """
    sel = sel.strip().lower()
    if sel in ("all", "*", "a"):
        return list(range(n))
    # Glue "2 - 6" into "2-6" first, so whitespace can then act as a separator
    # ("1 3" means 1 and 3, not 13).
    sel = re.sub(r"\s*-\s*", "-", sel)
    picked = set()
    for part in re.split(r"[,\s]+", sel):
        if not part:
            continue
        lo, dash, hi = part.partition("-")
        try:
            first = int(lo)
            last = int(hi) if dash else first
        except ValueError:
            # not a number / malformed range (also covers digit-like characters
            # such as "²" that int() rejects)
            continue
        # Clamp to 1..n before iterating so a huge range costs nothing.
        picked.update(i - 1 for i in range(max(first, 1), min(last, n) + 1))
    return sorted(picked)


# --------------------------------------------------------------------------- #
# scrubber — best-effort secret redaction (you review the summary before upload)
# --------------------------------------------------------------------------- #
# Placeholder written in place of every redacted secret.
_REPL = "[REDACTED]"
# (label, pattern) pairs: every match of a pattern is replaced wholesale by _REPL
# and tallied under its label in the redaction summary.
_WHOLE = [
    ("private key block", re.compile(r"-----BEGIN [A-Z ]*PRIVATE KEY-----.*?-----END [A-Z ]*PRIVATE KEY-----", re.S)),
    ("openai/anthropic key", re.compile(r"\b(?:sk|sk-ant|sk-proj)-[A-Za-z0-9_\-]{20,}\b")),
    ("hf token", re.compile(r"\bhf_[A-Za-z0-9]{20,}\b")),
    ("github token", re.compile(r"\bgh[posru]_[A-Za-z0-9]{30,}\b")),
    ("aws access key id", re.compile(r"\b(?:AKIA|ASIA)[0-9A-Z]{16}\b")),
    ("google api key", re.compile(r"\bAIza[0-9A-Za-z_\-]{35}\b")),
    ("slack token", re.compile(r"\bxox[baprs]-[A-Za-z0-9-]{10,}\b")),
    ("bearer token", re.compile(r"(?i)\bBearer\s+[A-Za-z0-9._\-]{16,}")),
    ("jwt", re.compile(r"\beyJ[A-Za-z0-9_\-]{10,}\.[A-Za-z0-9_\-]{10,}\.[A-Za-z0-9_\-]{10,}\b")),
]
# Generic `key: value` / `key=value` secrets (case-insensitive key names).
# group1 = the key + separator, including the quotes around the key and an optional
#          opening quote of the value. Each quote may be JSON-escaped (\" or deeper),
#          which is how JSON embedded in a JSONL string field appears on disk;
# group2 = the secret value (stops at a quote, backslash, whitespace, or JSON delimiter,
#          so it works whether the value is bare or wrapped in escaped quotes).
_KV = re.compile(
    r"(?i)(\"?(?:password|passwd|secret|token|api[_-]?key|access[_-]?key|client[_-]?secret|auth[_-]?token)(?:\\*\")?\s*[:=]\s*(?:\\*\")?)"
    r"([^\"\\\s,}{]{6,})"
)


def scrub_text(text: str):
    """Redact likely secrets from *text* (best-effort).

    First applies every ``_WHOLE`` pattern, then the generic ``_KV`` key/value
    pattern, replacing each secret with ``_REPL``.

    Returns ``(scrubbed_text, counts)`` where ``counts`` maps a label to the
    number of redactions made under it; labels with no hits are absent. A value
    that a ``_WHOLE`` pattern already replaced is not counted a second time
    under "key=value secret".
    """
    counts = {}
    for name, pat in _WHOLE:
        text, n = pat.subn(_REPL, text)
        if n:
            counts[name] = counts.get(name, 0) + n

    fresh = 0

    def _kv(m):
        nonlocal fresh
        # The value may already be the placeholder left by a _WHOLE pattern;
        # rewriting it is harmless, but it must not inflate the summary.
        if m.group(2) != _REPL:
            fresh += 1
        return m.group(1) + _REPL

    text = _KV.sub(_kv, text)
    if fresh:
        counts["key=value secret"] = counts.get("key=value secret", 0) + fresh
    return text, counts


# --------------------------------------------------------------------------- #
# upload (stdlib multipart)
# --------------------------------------------------------------------------- #
def upload_file(host: str, token: str, client: str, project: str, filename: str, data: bytes):
    """POST one session file to ``https://{host}/api/upload`` as multipart/form-data.

    The form carries a ``project`` text field followed by a ``file`` part sent
    as ``application/jsonl``. The request carries ``Authorization: Bearer
    {token}`` and an ``X-Her-Client`` header holding *client*.

    Returns the response body decoded as JSON. urllib errors (for example
    ``HTTPError``) and JSON decoding errors propagate to the caller.
    """
    boundary = "----her" + uuid.uuid4().hex
    delimiter = f"--{boundary}\r\n"
    project_part = f"{delimiter}Content-Disposition: form-data; name=\"project\"\r\n\r\n{project}\r\n"
    file_header = (
        f"{delimiter}Content-Disposition: form-data; name=\"file\"; filename=\"{filename}\"\r\n"
        "Content-Type: application/jsonl\r\n\r\n"
    )
    closing = f"\r\n--{boundary}--\r\n"
    body = b"".join((project_part.encode(), file_header.encode(), data, closing.encode()))

    headers = {
        "Content-Type": f"multipart/form-data; boundary={boundary}",
        "Authorization": f"Bearer {token}",
        "X-Her-Client": client,
    }
    req = urllib.request.Request(f"https://{host}/api/upload", data=body, method="POST", headers=headers)
    with urllib.request.urlopen(req, timeout=120) as resp:
        payload = resp.read()
    return json.loads(payload.decode("utf-8"))


# --------------------------------------------------------------------------- #
# main
# --------------------------------------------------------------------------- #
def main():
    """Run the interactive scan → scrub → upload flow.

    Steps, each gated on an explicit confirmation:
      1. list the projects found under ``--projects-dir`` and let the user pick;
      2. copy the picked sessions into ``--staging`` (wiped first), scrubbing
         likely secrets from the copies and printing a redaction summary;
      3. upload the scrubbed copies to the Space and print the URL to open.

    Exits with status 1 when the staging folder overlaps the source folder, no
    token or no sessions are found, or every upload fails. Exits with status 0
    when nothing is selected or the user declines a step.
    """
    ap = argparse.ArgumentParser(description="Bulk-upload Claude Code sessions to your Her Space.")
    ap.add_argument("--space", default=os.environ.get("HER_SPACE", DEFAULT_SPACE), help="HF Space id (owner/name)")
    ap.add_argument("--host", default=os.environ.get("HER_HOST"), help="override the *.hf.space host")
    ap.add_argument("--projects-dir", default=os.path.expanduser("~/.claude/projects"))
    ap.add_argument("--staging", default=os.path.abspath("./her-staging"))
    args = ap.parse_args()

    # A quoted "~" reaches us unexpanded by the shell; expand it here.
    projects_dir = os.path.expanduser(args.projects_dir)
    staging = Path(args.staging).expanduser().resolve()
    source = Path(projects_dir).resolve()
    # The staging folder is deleted and recreated in step 2. Refuse any location
    # that is, contains, or sits inside the source folder so the originals can
    # never be removed or written into.
    if staging == source or staging in source.parents or source in staging.parents:
        print(c(f"Staging folder {staging} overlaps the source folder {source}.", "red"))
        print("Choose a different --staging location; it is wiped before each run.")
        sys.exit(1)

    host = args.host or space_host(args.space)
    token = hf_token()
    client = uuid.uuid4().hex  # this upload's private namespace; the open-URL carries it

    print(c("\nHer · हेर — bring your sessions in", "bold"))
    print(c(f"Space: {args.space}  ({host})", "dim"))
    print(c(f"Source: {projects_dir}", "dim"))

    # ---- STEP 1: SELECT ---------------------------------------------------- #
    hr()
    print(c("STEP 1 / 3 · choose projects", "cyan"))
    groups = scan(projects_dir)
    if not groups:
        print(c(f"No .jsonl sessions found under {projects_dir}", "red"))
        sys.exit(1)
    for i, g in enumerate(groups, 1):
        label = c(g["cwd"] or g["encoded"], "orange")
        count = len(g["files"])
        plural = "s" if count != 1 else ""
        print(f"  {i:>2}. {label}  " + c(f"({count} session{plural})", "dim"))
    print(c("\nEnter numbers (e.g. 1,3,5 or 2-6), or 'all'.", "dim"))
    picks = parse_selection(ask("Select projects: "), len(groups))
    if not picks:
        print("Nothing selected.")
        sys.exit(0)
    chosen = [groups[i] for i in picks]
    total_files = sum(len(g["files"]) for g in chosen)
    print(c(f"\n→ {len(chosen)} project(s), {total_files} session(s) selected.", "green"))
    if not confirm("Copy these into the staging folder and continue?"):
        sys.exit(0)

    # ---- STEP 2: COPY + SCRUB --------------------------------------------- #
    hr()
    print(c("STEP 2 / 3 · copy to staging + scrub secrets", "cyan"))
    if staging.exists():
        # start from a clean folder so stale copies from an earlier run are never uploaded
        shutil.rmtree(staging, ignore_errors=True)
    staging.mkdir(parents=True, exist_ok=True)
    staged = []          # (project_encoded, staged_path, original_name)
    redaction_totals = {}
    files_with_redactions = 0
    for g in chosen:
        outdir = staging / g["encoded"]
        outdir.mkdir(parents=True, exist_ok=True)
        for src in g["files"]:
            name = os.path.basename(src)
            try:
                raw = Path(src).read_text(encoding="utf-8", errors="replace")
            except OSError as e:
                # best-effort: keep going, but tell the user the file was left out
                print(c(f"   skipped unreadable session {src}: {e}", "red"))
                continue
            cleaned, counts = scrub_text(raw)
            if counts:
                files_with_redactions += 1
                for k, v in counts.items():
                    redaction_totals[k] = redaction_totals.get(k, 0) + v
            dst = outdir / name
            dst.write_text(cleaned, encoding="utf-8")
            staged.append((g["encoded"], dst, name))
    print(c(f"Staged {len(staged)} scrubbed session(s) → {staging}", "green"))
    if redaction_totals:
        print(c(f"Redacted likely secrets in {files_with_redactions} file(s):", "orange"))
        for k, v in sorted(redaction_totals.items(), key=lambda x: -x[1]):
            print(f"   · {k}: {v}")
    else:
        print(c("No obvious secrets matched (the scrubber is best-effort — review if unsure).", "dim"))
    print(c(f"\nYou can inspect the scrubbed copies in {staging} before uploading.", "dim"))
    if not confirm("Upload these scrubbed sessions to your private Space?"):
        print("Stopped before upload. Staging kept for your review.")
        sys.exit(0)

    # ---- STEP 3: UPLOAD ---------------------------------------------------- #
    hr()
    print(c("STEP 3 / 3 · upload", "cyan"))
    ok = 0
    for idx, (enc, path, name) in enumerate(staged, 1):
        progress = f"  [{idx}/{len(staged)}]"
        try:
            data = path.read_bytes()
            upload_file(host, token, client, enc, name, data)
            ok += 1
            print(f"{progress} {c('uploaded', 'green')} {enc}/{name}")
        except urllib.error.HTTPError as e:
            print(f"{progress} {c('FAILED', 'red')} {name}: HTTP {e.code} {e.reason}")
        except Exception as e:  # noqa: BLE001 — one bad file must not stop the batch
            print(f"{progress} {c('FAILED', 'red')} {name}: {e}")

    hr()
    if ok == 0:
        print(c("No sessions uploaded.", "red"))
        sys.exit(1)
    print(c(f"✅ Uploaded {ok}/{len(staged)} session(s).", "green"))
    spaces_url = f"https://huggingface.co/spaces/{args.space}?client={client}"
    print("\nOpen your Projects view (bound to this upload):")
    print("   " + c(spaces_url, "cyan"))
    print(c("\n⏳ Give it a few seconds on first open — the Space analyzes the sessions and", "orange"))
    print(c("   the local model writes the cross-session summary. If a project briefly shows", "orange"))
    print(c("   “no sessions found”, just wait a moment and refresh; it’s still generating.", "orange"))
    print(c("\nIf your projects don't appear, open the Space, then in the browser console run:", "dim"))
    print(c(f"   localStorage.setItem('her.clientId','{client}'); location.reload()", "dim"))
    print(c("\nReminder: your uploads auto-delete after 24h, or instantly via “clear my data”.", "dim"))


# Script entry point: run the interactive flow only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()