#!/usr/bin/env python3
"""
Redact Models Table Pro CSV for the free HuggingFace version.
Columns matched by 'contains' logic so minor header changes don't break things.
Any column not listed in VISIBLE is fully blacked out.
"""
import csv
import hashlib
import sys
from pathlib import Path
# Placeholder text written into every cell whose value is withheld.
REDACT_MARKER = "███"

# Header fragments of the columns whose data may be shown. Each entry is
# compared against the CSV headers with a case-insensitive substring test.
VISIBLE = {
    "Model",
    "Lab",
    "Playground",
    "Params (total",
    "Params (active",
    "Arch",
    "Tokens trained",
    "Data ratio",
    "H100 cost to train",
    "ALScore",
    "MMLU",
    "GPQA",
    "HLE",
    "Training dataset",
    "Announced",
    "Public?",
    "Disclosure score",
    "Paper / Repo",
    "Tags",
    "Notes",
    "Count (rough)",
    "Audit",
    "Params total confidence",
    "Params active confidence",
    "Tokens confidence",
    "License",
    "Context window",
    "Country",
}

# Members of VISIBLE whose header stays recognisable but whose data is
# blacked out in every row.
ALWAYS_BLACKOUT = {
    "H100 cost to train",
    "Audit",
    "Params total confidence",
    "Params active confidence",
    "Tokens confidence",
}

# Columns excluded from the per-model, hash-driven redaction.
NEVER_REDACT = {
    "Model",
    "Lab",
    "Announced",
}

# Pool from which the hash selects one additional column to redact per model.
DOUBLE_REDACT = {
    "License",
    "Context window",
    "Country",
}
def clean_header(h):
    """Return *h* with each whitespace run collapsed to one space and both ends trimmed."""
    # split() with no argument discards leading/trailing whitespace and
    # treats any run of whitespace (including newlines) as one separator.
    words = h.split()
    return " ".join(words)
def match(header, name_set):
    """Return True if any name in *name_set* occurs inside *header*, ignoring case."""
    lowered = header.lower()
    for candidate in name_set:
        if candidate.lower() in lowered:
            return True
    return False
def resolve(headers, name_set):
    """Return the set of positions in *headers* whose header matches *name_set*.

    Matching is delegated to ``match`` (case-insensitive substring test).
    """
    hits = set()
    for position, header in enumerate(headers):
        if match(header, name_set):
            hits.add(position)
    return hits
def hash_redact(model_name, redactable, double_list):
    """Deterministically choose the extra entries to redact for one model.

    The SHA-256 hex digest of *model_name* drives the choice: its first eight
    hex digits index into *redactable* and the next eight index into
    *double_list* (both modulo the pool length), so the same model name
    always yields the same picks.

    Args:
        model_name: Text that seeds the hash.
        redactable: Sequence to pick the first entry from.
        double_list: Sequence to pick the second entry from.

    Returns:
        A list with one pick per non-empty pool, *redactable* first. With both
        pools non-empty this is a two-element list; an empty pool contributes
        nothing.
    """
    digest = hashlib.sha256(model_name.encode()).hexdigest()
    picks = []
    for pool, chunk in ((redactable, digest[:8]), (double_list, digest[8:16])):
        # An empty pool would make the modulo below divide by zero; skip it.
        if pool:
            picks.append(pool[int(chunk, 16) % len(pool)])
    return picks
def main():
    """Redact a models-table CSV and write the redacted copy.

    Command line: ``<input.csv> [output.csv]``. Without an output path the
    result is written to ``models-table-free.csv`` next to the input file.

    The first row of the input is skipped and the second row is used as the
    header row; every later row is data. For each data row with a non-blank
    first cell:

    * columns that do not match ``VISIBLE``, or that match
      ``ALWAYS_BLACKOUT``, are replaced by ``REDACT_MARKER``;
    * the cells returned by ``hash_redact`` for that row's first cell are
      replaced as well.

    A summary and a per-column redaction count are printed afterwards.

    Raises:
        SystemExit: when no input path is given, or when the input has fewer
            than two rows (so there is no header row to read).
    """
    from collections import Counter

    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <input.csv> [output.csv]")
        sys.exit(1)

    infile = Path(sys.argv[1])
    if len(sys.argv) > 2:
        outfile = Path(sys.argv[2])
    else:
        outfile = infile.with_name("models-table-free.csv")

    with open(infile, newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        try:
            # Row 1 is discarded (presumably a title/banner row in the
            # export -- confirm against the source sheet); row 2 has headers.
            next(reader)
            raw_headers = next(reader)
        except StopIteration:
            # Fail with a readable message instead of a bare traceback.
            sys.exit(f"ERROR: {infile} has fewer than two rows; no header row found")
        data_rows = list(reader)

    headers = [clean_header(h) for h in raw_headers]
    # Drop trailing unnamed columns so they are not emitted.
    while headers and not headers[-1]:
        headers.pop()
    num_cols = len(headers)

    visible_idx = resolve(headers, VISIBLE)
    blackout_idx = (set(range(num_cols)) - visible_idx) | resolve(headers, ALWAYS_BLACKOUT)
    never_idx = resolve(headers, NEVER_REDACT)
    double_list = sorted(resolve(headers, DOUBLE_REDACT))
    # Sorted so the hash-selected index is stable from run to run.
    redactable = sorted(visible_idx - never_idx - blackout_idx - set(double_list))

    # Report VISIBLE entries that matched no header at all.
    unmatched = [n for n in VISIBLE if not resolve(headers, {n})]
    if unmatched:
        print(f"WARNING: not found in CSV: {unmatched}")

    out_rows = []
    for row in data_rows:
        model_name = row[0].strip() if row else ""
        if not model_name:
            continue
        # Truncate or pad the row to exactly num_cols cells.
        full = list(row[:num_cols])
        full += [""] * (num_cols - len(full))
        for c in blackout_idx:
            full[c] = REDACT_MARKER
        for c in hash_redact(model_name, redactable, double_list):
            full[c] = REDACT_MARKER
        out_rows.append(full)

    with open(outfile, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows(out_rows)
    print(f"Wrote {len(out_rows)} rows, {num_cols} columns to {outfile}")

    # Count, per column, how many output cells hold the redaction marker.
    dist = Counter(
        headers[i]
        for row in out_rows
        for i, v in enumerate(row)
        if v == REDACT_MARKER
    )
    print("\nRedaction distribution:")
    for col, count in dist.most_common():
        print(f" {col}: {count}")
# Run the redaction only when this file is executed as a script.
if __name__ == "__main__":
    main()