Spaces:
Running
Running
| """Build data.json for the Gradio Space. | |
| Positions: pre-computed 3D embeddings of the `src/transformers/**` source files | |
| (downloaded from a private HF bucket). Falls back to PCA on the original | |
| 384-dim embeddings if the 3D file is missing. | |
| Colors: recency-weighted edit score from `git log` on the cloned transformers repo. | |
| """ | |
| import datetime as dt | |
| import json | |
| import math | |
| import os | |
| import re | |
| import subprocess | |
| import urllib.error | |
| import urllib.request | |
| from collections import defaultdict | |
| from pathlib import Path | |
| import numpy as np | |
# Directory containing this script; every other location is derived from it.
ROOT = Path(__file__).parent

# The transformers checkout and the downloaded embedding files all live under
# ROOT/.cache.
_CACHE_DIR = ROOT / ".cache"
REPO_DIR = _CACHE_DIR / "transformers"

# Both embedding files are served from the same bucket location.
_BUCKET_BASE_URL = "https://huggingface.co/buckets/the-best-team/data/resolve/"

# Pre-computed 3D vectors (primary source of point positions).
EMBEDDINGS_3D_FILE = _CACHE_DIR / "transformers-embeddings-src-3d.jsonl"
EMBEDDINGS_3D_URL = _BUCKET_BASE_URL + "transformers-embeddings-src-3d.jsonl"

# Original embeddings, reduced with PCA when the 3D file is unavailable.
EMBEDDINGS_FILE = _CACHE_DIR / "transformers-embeddings-src.jsonl"
EMBEDDINGS_URL = _BUCKET_BASE_URL + "transformers-embeddings-src.jsonl"

# Output consumed by the Space.
DATA_FILE = ROOT / "data.json"

# Prepended to the `path` field of every embedding record.
SRC_PREFIX = "src/transformers/"

# An edit loses half of its weight every year.
HALF_LIFE_SECONDS = 365 * 24 * 60 * 60

# Files whose path matches any of these regexes are dropped from the point cloud.
SKIP_PATH_PATTERNS = [
    re.compile(pattern)
    for pattern in (
        r"(^|/)__init__\.py$",
        r"(^|/)modeling_.*\.py$",
        r"^src/transformers/cli/transformers\.py$",
    )
]
def is_skipped(path):
    """Tell whether `path` must be left out of the point cloud.

    Returns True as soon as one of the compiled regexes in
    SKIP_PATH_PATTERNS is found somewhere in `path`, False when none match.
    """
    for pattern in SKIP_PATH_PATTERNS:
        if pattern.search(path):
            return True
    return False
def run(cmd):
    """Execute `cmd` (an argument list) and return its stdout as text.

    Both stdout and stderr are captured rather than echoed. A non-zero exit
    status raises subprocess.CalledProcessError.
    """
    completed = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        check=True,
    )
    return completed.stdout
def hf_token():
    """Look up the Hugging Face access token.

    The token file under ~/.cache/huggingface/ wins when it exists (its
    content is returned with surrounding whitespace stripped). Otherwise the
    HF_TOKEN environment variable is used, and an empty string is returned
    when that is unset as well.
    """
    token_file = Path.home() / ".cache" / "huggingface" / "token"
    if token_file.exists():
        return token_file.read_text().strip()
    return os.environ.get("HF_TOKEN", "")
def download(url, dest):
    """Fetch `url` into the file `dest`, using `dest` itself as the cache.

    Args:
        url: Location to download from.
        dest: pathlib.Path of the target file. Its parent directory is
            created when missing.

    Returns:
        True if `dest` already existed or the download completed, False if
        the request failed with a urllib URLError/HTTPError (the error is
        printed). Other exceptions propagate to the caller.
    """
    if dest.exists():
        return True
    dest.parent.mkdir(parents=True, exist_ok=True)

    # Only send credentials when there are some: with no token configured the
    # header would be the malformed value "Bearer ".
    token = hf_token()
    headers = {"Authorization": f"Bearer {token}"} if token else {}

    # Write to a sibling temp file and rename once complete. Writing straight
    # to `dest` would leave an empty or truncated file behind when the transfer
    # breaks, and the `dest.exists()` check above would then treat that file
    # as a valid cached download on every later run.
    partial = dest.with_name(dest.name + ".part")
    chunk_size = 1 << 20
    try:
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req) as resp, partial.open("wb") as out:
            # Stream in chunks instead of holding the whole payload in memory.
            while True:
                chunk = resp.read(chunk_size)
                if not chunk:
                    break
                out.write(chunk)
        partial.replace(dest)
        return True
    except urllib.error.URLError as e:  # HTTPError is a subclass of URLError
        print(f" download failed for {url}: {e}")
        return False
    finally:
        # No-op after a successful rename; removes leftovers on any failure.
        if partial.exists():
            partial.unlink()
def load_embeddings_3d():
    """Primary source: per-file 3D vectors keyed under `reduced_embedding`.

    Downloads the 3D JSONL file if it is not cached yet, then reads one JSON
    object per line. Each record's `path` is prefixed with SRC_PREFIX.

    Returns:
        (paths, coords): a list of prefixed paths and a float64 array built
        from the `reduced_embedding` values, in file order; or (None, None)
        if the file isn't available.
    """
    if not download(EMBEDDINGS_3D_URL, EMBEDDINGS_3D_FILE):
        return None, None
    paths, vecs = [], []
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with EMBEDDINGS_3D_FILE.open(encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # failing in json.loads.
            if not line.strip():
                continue
            d = json.loads(line)
            paths.append(SRC_PREFIX + d["path"])
            vecs.append(d["reduced_embedding"])
    return paths, np.asarray(vecs, dtype=np.float64)
def load_embeddings_pca_fallback():
    """Fallback: load the 384-dim embeddings and reduce them to 3D via PCA.

    Downloads the embeddings JSONL file if it is not cached yet, then reads
    one JSON object per line. Each record's `path` is prefixed with
    SRC_PREFIX and its `embedding` becomes one row of the matrix handed to
    pca_3d.

    Returns:
        (paths, coords): the list of prefixed paths and the pca_3d projection
        of the embedding matrix, in file order.

    Raises:
        RuntimeError: if the embedding file cannot be downloaded.
    """
    if not download(EMBEDDINGS_URL, EMBEDDINGS_FILE):
        raise RuntimeError("Neither the 3D nor the 384-dim embedding file is available.")
    paths, vecs = [], []
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with EMBEDDINGS_FILE.open(encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # failing in json.loads.
            if not line.strip():
                continue
            d = json.loads(line)
            paths.append(SRC_PREFIX + d["path"])
            vecs.append(d["embedding"])
    matrix = np.asarray(vecs, dtype=np.float64)
    return paths, pca_3d(matrix)
def pca_3d(matrix):
    """Reduce an (N, D) array to (N, 3) with PCA computed through an SVD.

    The columns are mean-centered, the centered data is projected onto the
    first three right-singular vectors, and each output axis is then divided
    by its own standard deviation (plus a tiny epsilon that guards against
    division by zero) so the axes end up with roughly unit spread.
    """
    centered = matrix - matrix.mean(axis=0, keepdims=True)
    # Rows of the third SVD factor are the principal directions.
    principal_axes = np.linalg.svd(centered, full_matrices=False)[2]
    projected = centered @ principal_axes[:3].T
    axis_spread = projected.std(axis=0, keepdims=True)
    projected /= axis_spread + 1e-12
    return projected
def load_positions():
    """Return (paths, coords) for the point cloud.

    The pre-computed 3D embeddings are preferred; when they cannot be loaded
    the 384-dim embeddings are reduced with PCA instead. Which source was
    used is printed.
    """
    paths, coords = load_embeddings_3d()
    if paths is None:
        print("3D embeddings unavailable; falling back to PCA on 384-dim file.")
        return load_embeddings_pca_fallback()
    print(f"Using pre-computed 3D embeddings: {len(paths)} files.")
    return paths, coords
def edit_timelines():
    """Map every path reported by `git log` to its list of commit timestamps.

    `git log --name-only` is run on REPO_DIR with a format that prints one
    `COMMIT:<%ct>` header line per commit, followed by the paths that commit
    touched. Each path collects the integer timestamp of every commit it
    appears under, in the order git prints them.

    Returns:
        A defaultdict(list) keyed by path as printed by git.
    """
    marker = "COMMIT:"
    git_cmd = [
        "git", "-C", str(REPO_DIR),
        "log", "--name-only", "--pretty=format:COMMIT:%ct",
    ]
    timelines = defaultdict(list)
    commit_ts = None
    for raw_line in run(git_cmd).split("\n"):
        if raw_line.startswith(marker):
            # Header line: remember the timestamp for the paths that follow.
            commit_ts = int(raw_line[len(marker):])
            continue
        name = raw_line.strip()
        # Skip blank separators and anything printed before the first header.
        if name and commit_ts is not None:
            timelines[name].append(commit_ts)
    return timelines
def recency_weighted_score(timestamps, now_ts):
    """Score a file by its edits, with each edit's weight halving over time.

    An edit made exactly at `now_ts` contributes 1.0; the contribution halves
    for every HALF_LIFE_SECONDS of age. The score is the sum over all
    `timestamps`, or 0.0 when there are none.
    """
    if not timestamps:
        return 0.0

    def decayed_weight(ts):
        age_in_half_lives = (now_ts - ts) / HALF_LIFE_SECONDS
        return 0.5 ** age_in_half_lives

    return sum(map(decayed_weight, timestamps))
def redness_scores(scores):
    """Turn raw scores into color values in [0, 1], where high score -> 0 (red).

    Each score is log-compressed with log1p, the results are min-max
    normalized, and the scale is inverted so the highest score maps to 0.0
    and the lowest to 1.0. When every score is identical the span is zero, so
    1.0 is used as the divisor and every value comes out as 1.0.

    Args:
        scores: Sequence of numeric scores.

    Returns:
        A list of floats, one per input score, in the same order. An empty
        input yields an empty list.
    """
    log_scores = [math.log1p(s) for s in scores]
    if not log_scores:
        # min()/max() raise ValueError on an empty sequence; nothing to color.
        return []
    lo, hi = min(log_scores), max(log_scores)
    span = (hi - lo) or 1.0
    return [1.0 - (ls - lo) / span for ls in log_scores]
def main():
    """Build data.json: 3D positions, edit-recency colors and hover labels.

    Steps: load the point positions, drop files matched by
    SKIP_PATH_PATTERNS, collect each remaining file's commit timestamps from
    git, score them by recency, and write everything to DATA_FILE as JSON.
    """
    paths, coords = load_positions()

    # Filter paths and coordinates together so they stay aligned.
    kept = [idx for idx, path in enumerate(paths) if not is_skipped(path)]
    dropped = len(paths) - len(kept)
    if dropped > 0:
        print(f"Skipping {dropped} files via SKIP_PATH_PATTERNS.")
        paths = [paths[idx] for idx in kept]
        coords = coords[kept]
    print(f"Per-axis std: {coords.std(axis=0)}")

    timelines = edit_timelines()
    now_ts = int(dt.datetime.now().timestamp())

    # Files that never show up in the git log get an empty timeline.
    edit_times = [timelines.get(path, []) for path in paths]
    scores = [recency_weighted_score(stamps, now_ts) for stamps in edit_times]

    hovers = []
    for path, stamps in zip(paths, edit_times):
        if stamps:
            last = dt.date.fromtimestamp(max(stamps)).isoformat()
        else:
            last = "never"
        hovers.append(f"{path}<br>edits: {len(stamps)} (last: {last})")

    xs, ys, zs = (coords[:, axis].tolist() for axis in range(3))
    data = {
        "x": xs,
        "y": ys,
        "z": zs,
        "color": redness_scores(scores),
        "edit_times": edit_times,
        "hover": hovers,
    }
    DATA_FILE.write_text(json.dumps(data))

    top_score = max(scores)
    print(
        f"Wrote {DATA_FILE} — {len(paths)} points, "
        f"max recency-weighted score: {top_score:.2f}"
    )
| if __name__ == "__main__": | |
| main() | |