models-table / redact.py
adt's picture
Upload 2 files
0005359 verified
Raw
History Blame Contribute Delete
3.85 kB
#!/usr/bin/env python3
"""
Redact Models Table Pro CSV for the free HuggingFace version.
Columns matched by 'contains' logic so minor header changes don't break things.
Any column not listed in VISIBLE is fully blacked out.
"""
import csv
import hashlib
import sys
from pathlib import Path
# Placeholder text written into every redacted cell.
REDACT_MARKER = "███"

# Header fragments for columns that keep their real data.
# Each fragment is compared against the CSV headers with a
# case-insensitive "contains" test.
VISIBLE = {
    "Model",
    "Lab",
    "Playground",
    "Params (total",
    "Params (active",
    "Arch",
    "Tokens trained",
    "Data ratio",
    "H100 cost to train",
    "ALScore",
    "MMLU",
    "GPQA",
    "HLE",
    "Training dataset",
    "Announced",
    "Public?",
    "Disclosure score",
    "Paper / Repo",
    "Tags",
    "Notes",
    "Count (rough)",
    "Audit",
    "Params total confidence",
    "Params active confidence",
    "Tokens confidence",
    "License",
    "Context window",
    "Country",
}

# Members of VISIBLE whose header is kept but whose cells are
# blacked out on every row.
ALWAYS_BLACKOUT = {
    "H100 cost to train",
    "Audit",
    "Params total confidence",
    "Params active confidence",
    "Tokens confidence",
}

# Columns the per-model hash redaction is never allowed to pick.
NEVER_REDACT = {
    "Model",
    "Lab",
    "Announced",
}

# Pool from which the hash picks one additional column to redact.
DOUBLE_REDACT = {
    "License",
    "Context window",
    "Country",
}
def clean_header(h):
    """Normalise a raw header cell.

    Splits *h* on any whitespace (spaces, tabs, newlines) and rejoins the
    pieces with single spaces, which also drops leading and trailing
    whitespace.
    """
    words = h.split()
    return " ".join(words)
def match(header, name_set):
    """Report whether *header* contains any entry of *name_set*.

    The comparison is a case-insensitive substring test; an empty
    *name_set* never matches.
    """
    lowered = header.lower()
    for name in name_set:
        if name.lower() in lowered:
            return True
    return False
def resolve(headers, name_set):
    """Return the set of column indices whose header matches *name_set*.

    A header counts as a hit when ``match(header, name_set)`` is true.
    """
    hits = set()
    for position, header in enumerate(headers):
        if match(header, name_set):
            hits.add(position)
    return hits
def hash_redact(model_name, redactable, double_list):
    """Deterministically pick the extra entries to redact for one model.

    The SHA-256 hex digest of *model_name* drives the choice, so the same
    name always yields the same picks: the first 8 hex digits select an
    entry of *redactable*, the next 8 select an entry of *double_list*.

    Args:
        model_name: Text hashed to drive the selection.
        redactable: Indexable pool for the first pick.
        double_list: Indexable pool for the second pick.

    Returns:
        A list with the pick from *redactable* followed by the pick from
        *double_list*. An empty pool contributes no entry, so the result
        can hold two, one, or zero items.
    """
    digest = hashlib.sha256(model_name.encode()).hexdigest()
    picks = []
    for pool, hex_slice in ((redactable, digest[:8]), (double_list, digest[8:16])):
        # Skip empty pools: taking the modulo by len(pool) == 0 would raise
        # ZeroDivisionError.
        if pool:
            picks.append(pool[int(hex_slice, 16) % len(pool)])
    return picks
def _read_table(infile):
    """Read the CSV at *infile* and return ``(raw_headers, data_rows)``.

    The first CSV record is discarded (presumably a title/banner line --
    confirm against the export format), the second record is taken as the
    header row, and every remaining record is returned as data.

    Returns ``None`` when the file holds fewer than two records.
    """
    with open(infile, newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        next(reader, None)  # discard the record that precedes the headers
        raw_headers = next(reader, None)
        if raw_headers is None:
            return None
        return raw_headers, list(reader)


def main():
    """Command-line entry point: write a redacted copy of a CSV.

    Usage: ``<script> <input.csv> [output.csv]``. When no output path is
    given, ``models-table-free.csv`` next to the input file is used.

    Steps:
      1. Read the input; headers come from the second record.
      2. Classify columns by matching headers against VISIBLE,
         ALWAYS_BLACKOUT, NEVER_REDACT and DOUBLE_REDACT.
      3. For every row with a non-empty first cell, black out the hidden
         columns and the per-model picks returned by ``hash_redact``.
      4. Write the result and print how many cells were redacted per column.

    Exits with status 1 when the input argument is missing or the input
    file has no usable header row.
    """
    from collections import Counter

    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <input.csv> [output.csv]", file=sys.stderr)
        sys.exit(1)

    infile = Path(sys.argv[1])
    if len(sys.argv) > 2:
        outfile = Path(sys.argv[2])
    else:
        outfile = infile.with_name("models-table-free.csv")

    table = _read_table(infile)
    if table is None:
        print(
            f"ERROR: {infile} has no header row (expected as the second record)",
            file=sys.stderr,
        )
        sys.exit(1)
    raw_headers, data_rows = table

    headers = [clean_header(h) for h in raw_headers]
    # Drop trailing empty header cells so they do not become output columns.
    while headers and not headers[-1]:
        headers.pop()
    num_cols = len(headers)
    if num_cols == 0:
        print(f"ERROR: header row of {infile} is empty", file=sys.stderr)
        sys.exit(1)

    visible_idx = resolve(headers, VISIBLE)
    # Hidden everywhere: columns not matched by VISIBLE, plus the
    # always-blacked-out ones.
    blackout_idx = (set(range(num_cols)) - visible_idx) | resolve(headers, ALWAYS_BLACKOUT)
    never_idx = resolve(headers, NEVER_REDACT)
    double_list = sorted(resolve(headers, DOUBLE_REDACT))
    # Columns eligible for the first hash pick; sorted so the choice is stable.
    redactable = sorted(visible_idx - never_idx - blackout_idx - set(double_list))

    # Sorted so the warning text is the same from run to run.
    unmatched = sorted(name for name in VISIBLE if not resolve(headers, {name}))
    if unmatched:
        print(f"WARNING: not found in CSV: {unmatched}", file=sys.stderr)

    out_rows = []
    for row in data_rows:
        model_name = row[0].strip() if row else ""
        if not model_name:
            # Rows without a value in the first column are dropped.
            continue
        # Truncate or pad the row to exactly num_cols cells.
        full = list(row[:num_cols])
        full.extend([""] * (num_cols - len(full)))
        for c in blackout_idx:
            full[c] = REDACT_MARKER
        for c in hash_redact(model_name, redactable, double_list):
            full[c] = REDACT_MARKER
        out_rows.append(full)

    with open(outfile, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows(out_rows)
    print(f"Wrote {len(out_rows)} rows, {num_cols} columns to {outfile}")

    # Count redacted cells per column header for a quick sanity check.
    dist = Counter(
        headers[i]
        for row in out_rows
        for i, value in enumerate(row)
        if value == REDACT_MARKER
    )
    print("\nRedaction distribution:")
    for col, count in dist.most_common():
        print(f" {col}: {count}")


if __name__ == "__main__":
    main()