Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| Redact Models Table Pro CSV for the free HuggingFace version. | |
| Columns matched by 'contains' logic so minor header changes don't break things. | |
| Any column not listed in VISIBLE is fully blacked out. | |
| """ | |
| import csv | |
| import hashlib | |
| import sys | |
| from pathlib import Path | |
# Placeholder written into every cell whose real value is withheld.
REDACT_MARKER = "███"

# Header fragments (matched with "contains" logic) of the columns that may
# show real data. A column whose header matches none of these is blacked out.
VISIBLE = {
    "Model",
    "Lab",
    "Playground",
    "Params (total",
    "Params (active",
    "Arch",
    "Tokens trained",
    "Data ratio",
    "H100 cost to train",
    "ALScore",
    "MMLU",
    "GPQA",
    "HLE",
    "Training dataset",
    "Announced",
    "Public?",
    "Disclosure score",
    "Paper / Repo",
    "Tags",
    "Notes",
    "Count (rough)",
    "Audit",
    "Params total confidence",
    "Params active confidence",
    "Tokens confidence",
    "License",
    "Context window",
    "Country",
}

# Listed in VISIBLE so the header is recognised, but the data is never revealed.
ALWAYS_BLACKOUT = {
    "H100 cost to train",
    "Audit",
    "Params total confidence",
    "Params active confidence",
    "Tokens confidence",
}

# Columns the per-model hash redaction must leave alone.
NEVER_REDACT = {
    "Model",
    "Lab",
    "Announced",
}

# Columns from which the hash picks one additional cell to redact.
DOUBLE_REDACT = {
    "License",
    "Context window",
    "Country",
}
def clean_header(h):
    """Return *h* with every whitespace run collapsed to one space and both ends trimmed."""
    # str.split() with no separator discards leading/trailing whitespace and
    # splits on any whitespace run, so re-joining needs no further stripping.
    words = h.split()
    return " ".join(words)
def match(header, name_set):
    """Return True if any name in *name_set* occurs inside *header*, ignoring case."""
    lowered = header.lower()
    for name in name_set:
        if name.lower() in lowered:
            return True
    return False
def resolve(headers, name_set):
    """Return the set of indices in *headers* whose header text matches *name_set*."""
    hits = set()
    for position, header in enumerate(headers):
        if match(header, name_set):
            hits.add(position)
    return hits
def hash_redact(model_name, redactable, double_list):
    """Pick the extra entries to redact for one model, deterministically.

    The SHA-256 hex digest of *model_name* drives two independent picks:
    its first 8 hex digits select an entry of *redactable* and the next
    8 hex digits select an entry of *double_list* (each taken modulo the
    pool length), so the same name always yields the same picks.

    An empty pool contributes no pick. Previously an empty pool raised
    ZeroDivisionError from the modulo.

    Args:
        model_name: Text whose hash seeds the selection.
        redactable: Sequence to take the first pick from.
        double_list: Sequence to take the second pick from.

    Returns:
        A list with one entry per non-empty pool, the *redactable* pick
        first; two entries when both pools are non-empty.
    """
    digest = hashlib.sha256(model_name.encode()).hexdigest()
    pools = ((redactable, digest[:8]), (double_list, digest[8:16]))
    return [pool[int(chunk, 16) % len(pool)] for pool, chunk in pools if pool]
def main():
    """Redact a models-table CSV from the command line.

    Usage: ``<script> <input.csv> [output.csv]``. Without an output path the
    result is written beside the input as ``models-table-free.csv``.

    The first CSV row is skipped and the second row is taken as the header
    row; trailing blank headers are dropped. For every data row with a
    non-empty first cell (used as the model name):

    * every column not matched by VISIBLE, plus every column matched by
      ALWAYS_BLACKOUT, is replaced with REDACT_MARKER;
    * the columns returned by ``hash_redact`` for that model name are
      replaced with REDACT_MARKER as well.

    Rows with an empty first cell are dropped. A per-column count of
    redacted cells is printed at the end.

    Exits with status 1 when no input path is given or when the input has
    no usable header row.
    """
    from collections import Counter

    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <input.csv> [output.csv]")
        sys.exit(1)

    infile = Path(sys.argv[1])
    if len(sys.argv) > 2:
        outfile = Path(sys.argv[2])
    else:
        outfile = infile.with_name("models-table-free.csv")

    with open(infile, newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        # The first row is not the header row; discard it. Using a default
        # avoids a bare StopIteration traceback on an empty or one-row file.
        next(reader, None)
        raw_headers = next(reader, None) or []
        data_rows = list(reader)

    headers = [clean_header(h) for h in raw_headers]
    while headers and not headers[-1]:
        headers.pop()
    if not headers:
        # sys.exit with a string prints it to stderr and exits with status 1.
        sys.exit(f"ERROR: no header row found in {infile} (expected as the second CSV row)")
    num_cols = len(headers)

    visible_idx = resolve(headers, VISIBLE)
    blackout_idx = (set(range(num_cols)) - visible_idx) | resolve(headers, ALWAYS_BLACKOUT)
    never_idx = resolve(headers, NEVER_REDACT)
    double_list = sorted(resolve(headers, DOUBLE_REDACT))
    redactable = sorted(visible_idx - never_idx - blackout_idx - set(double_list))

    # Sorted so the warning reads the same on every run (VISIBLE is a set).
    lowered_headers = [h.lower() for h in headers]
    unmatched = sorted(
        n for n in VISIBLE if not any(n.lower() in h for h in lowered_headers)
    )
    if unmatched:
        print(f"WARNING: not found in CSV: {unmatched}")

    out_rows = []
    for row in data_rows:
        model_name = row[0].strip() if row else ""
        if not model_name:
            continue
        # Truncate or pad the row to exactly the header width.
        full = list(row[:num_cols])
        full.extend("" for _ in range(num_cols - len(full)))
        for c in blackout_idx:
            full[c] = REDACT_MARKER
        for c in hash_redact(model_name, redactable, double_list):
            full[c] = REDACT_MARKER
        out_rows.append(full)

    with open(outfile, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows(out_rows)

    print(f"Wrote {len(out_rows)} rows, {num_cols} columns to {outfile}")

    dist = Counter(
        headers[i]
        for row in out_rows
        for i, v in enumerate(row)
        if v == REDACT_MARKER
    )
    print("\nRedaction distribution:")
    for col, count in dist.most_common():
        print(f" {col}: {count}")


if __name__ == "__main__":
    main()