File size: 3,851 Bytes
c19426b
 
 
0005359
 
c19426b
 
 
 
 
 
 
 
 
0005359
 
9ecc2e4
0005359
 
 
9ecc2e4
 
 
 
 
 
c19426b
0005359
 
 
 
 
c19426b
0005359
9ecc2e4
c19426b
0005359
9ecc2e4
c19426b
 
0005359
9ecc2e4
 
 
0005359
 
 
 
 
 
 
9ecc2e4
 
0005359
c19426b
0005359
 
 
 
c19426b
 
 
 
 
 
 
 
 
 
 
 
0005359
 
c19426b
 
0005359
9ecc2e4
 
 
 
0005359
 
 
 
 
9ecc2e4
0005359
9ecc2e4
0005359
c19426b
 
 
 
 
 
 
9ecc2e4
0005359
c19426b
9ecc2e4
c19426b
0005359
c19426b
 
9ecc2e4
c19426b
 
 
 
 
 
9ecc2e4
c19426b
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/usr/bin/env python3
"""
Redact Models Table Pro CSV for the free HuggingFace version.
Columns matched by 'contains' logic so minor header changes don't break things.
Any column not listed in VISIBLE is fully blacked out.
"""

import csv
import hashlib
import sys
from pathlib import Path

# Placeholder text written into every redacted cell.
REDACT_MARKER = "███"

# Names of the columns whose data may be shown. Each name is compared to the
# CSV headers as a case-insensitive substring, and every column that matches
# none of them is blacked out completely.
VISIBLE = {
    # Identity
    "Model",
    "Lab",
    "Playground",
    # Size and architecture
    "Params (total",
    "Params (active",
    "Arch",
    # Training
    "Tokens trained",
    "Data ratio",
    "H100 cost to train",
    # Scores
    "ALScore",
    "MMLU",
    "GPQA",
    "HLE",
    # Provenance and disclosure
    "Training dataset",
    "Announced",
    "Public?",
    "Disclosure score",
    "Paper / Repo",
    "Tags",
    "Notes",
    "Count (rough)",
    # Audit and confidence
    "Audit",
    "Params total confidence",
    "Params active confidence",
    "Tokens confidence",
    # Extras
    "License",
    "Context window",
    "Country",
}

# Members of VISIBLE whose header is kept but whose cells are always replaced
# by the marker.
ALWAYS_BLACKOUT = {
    "H100 cost to train",
    "Audit",
    "Params total confidence",
    "Params active confidence",
    "Tokens confidence",
}

# Columns excluded from the pool that the per-model hash redacts from.
NEVER_REDACT = {
    "Model",
    "Lab",
    "Announced",
}

# Separate pool from which the per-model hash picks one additional column
# to redact.
DOUBLE_REDACT = {
    "License",
    "Context window",
    "Country",
}


def clean_header(h):
    """Normalise a raw CSV header cell.

    Every run of whitespace (spaces, tabs, newlines) collapses to a single
    space, and leading and trailing whitespace is removed.
    """
    words = h.split()
    return " ".join(words)


def match(header, name_set):
    """Report whether any name in *name_set* occurs inside *header*.

    The test is a case-insensitive substring check, so a short name also
    matches longer headers that merely contain it.
    """
    lowered = header.lower()
    for name in name_set:
        if name.lower() in lowered:
            return True
    return False


def resolve(headers, name_set):
    """Return the set of positions in *headers* that match *name_set*.

    A position is included when match() accepts its header for the given
    names.
    """
    hits = set()
    for idx, header in enumerate(headers):
        if match(header, name_set):
            hits.add(idx)
    return hits


def hash_redact(model_name, redactable, double_list):
    """Pick the per-model columns to redact, deterministically.

    The SHA-256 hex digest of *model_name* drives both choices: its first
    eight hex digits select one entry of *redactable*, and the next eight
    select one entry of *double_list*. The same name therefore always
    yields the same picks.

    Args:
        model_name: Text to hash.
        redactable: Sequence of candidate column indices for the first pick.
        double_list: Sequence of candidate column indices for the second pick.

    Returns:
        A list holding the pick from *redactable* followed by the pick from
        *double_list*. An empty pool contributes nothing, so the list may
        hold fewer than two entries.
    """
    digest = hashlib.sha256(model_name.encode()).hexdigest()
    picks = []
    for pool, hex_chunk in ((redactable, digest[:8]), (double_list, digest[8:16])):
        # An empty pool would make the modulo below divide by zero.
        if pool:
            picks.append(pool[int(hex_chunk, 16) % len(pool)])
    return picks


def main():
    """Command-line entry point: redact a CSV and write the free version.

    Usage: ``<script> <input.csv> [output.csv]``. Without an explicit output
    path the result is written next to the input as ``models-table-free.csv``.

    The first CSV row of the input is discarded and the second row is used as
    the header row; trailing empty header cells are dropped. Rows whose first
    cell is blank are skipped. Every kept row is cut or padded to the header
    width, its blackout columns are replaced by REDACT_MARKER, and the
    columns chosen by hash_redact() for the value of its first cell are
    replaced as well. A per-column count of redacted cells is printed at the
    end.

    Exits with status 1 when no input path is given or when the input has
    fewer than two rows.
    """
    from collections import Counter

    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <input.csv> [output.csv]", file=sys.stderr)
        sys.exit(1)

    infile = Path(sys.argv[1])
    outfile = Path(sys.argv[2]) if len(sys.argv) > 2 else infile.with_name("models-table-free.csv")

    with open(infile, newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        try:
            next(reader)  # the first row is not the header row; discard it
            raw_headers = next(reader)
        except StopIteration:
            # Without this the script dies with a bare StopIteration traceback.
            print(
                f"ERROR: {infile} has fewer than two rows; expected the header on the second row",
                file=sys.stderr,
            )
            sys.exit(1)
        data_rows = list(reader)

    headers = [clean_header(h) for h in raw_headers]
    # Drop trailing empty header cells so they do not become columns.
    while headers and not headers[-1]:
        headers.pop()
    num_cols = len(headers)

    visible_idx = resolve(headers, VISIBLE)
    # Blacked out: every column outside VISIBLE plus the always-hidden ones.
    blackout_idx = (set(range(num_cols)) - visible_idx) | resolve(headers, ALWAYS_BLACKOUT)
    never_idx = resolve(headers, NEVER_REDACT)
    double_list = sorted(resolve(headers, DOUBLE_REDACT))
    # Pool for the first hash pick: visible columns that are not protected,
    # not already blacked out, and not part of the second pool.
    redactable = sorted(visible_idx - never_idx - blackout_idx - set(double_list))

    lowered_headers = [h.lower() for h in headers]
    unmatched = [n for n in VISIBLE if not any(n.lower() in h for h in lowered_headers)]
    if unmatched:
        print(f"WARNING: not found in CSV: {unmatched}")

    out_rows = []
    for row in data_rows:
        model_name = row[0].strip() if row else ""
        if not model_name:
            continue

        # Force the row to exactly num_cols cells.
        full = list(row[:num_cols])
        full.extend([""] * (num_cols - len(full)))

        for c in blackout_idx:
            full[c] = REDACT_MARKER
        for c in hash_redact(model_name, redactable, double_list):
            full[c] = REDACT_MARKER

        out_rows.append(full)

    with open(outfile, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows(out_rows)

    print(f"Wrote {len(out_rows)} rows, {num_cols} columns to {outfile}")

    # Count redacted cells per header for a quick sanity check of the output.
    dist = Counter(
        headers[i]
        for row in out_rows
        for i, v in enumerate(row)
        if v == REDACT_MARKER
    )
    print("\nRedaction distribution:")
    for col, count in dist.most_common():
        print(f"  {col}: {count}")


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()