Spaces:

adt
/

models-table

Running

App Files Files Community

models-table / redact.py

adt

Upload 2 files

0005359 verified 14 days ago

Raw

History Blame Contribute Delete

3.85 kB

	#!/usr/bin/env python3
	"""
	Redact Models Table Pro CSV for the free HuggingFace version.
	Columns matched by 'contains' logic so minor header changes don't break things.
	Any column not listed in VISIBLE is fully blacked out.
	"""

	import csv
	import hashlib
	import sys
	from pathlib import Path

	# Placeholder written into every redacted cell.
	REDACT_MARKER = "███"

	# Header fragments (case-insensitive substring match) of the columns that are
	# kept in the free version. Columns matching none of these are blacked out.
	VISIBLE = {
	    "Model",
	    "Lab",
	    "Playground",
	    "Params (total",
	    "Params (active",
	    "Arch",
	    "Tokens trained",
	    "Data ratio",
	    "H100 cost to train",
	    "ALScore",
	    "MMLU",
	    "GPQA",
	    "HLE",
	    "Training dataset",
	    "Announced",
	    "Public?",
	    "Disclosure score",
	    "Paper / Repo",
	    "Tags",
	    "Notes",
	    "Count (rough)",
	    "Audit",
	    "Params total confidence",
	    "Params active confidence",
	    "Tokens confidence",
	    "License",
	    "Context window",
	    "Country",
	}

	# Listed in VISIBLE so the header is recognised, but the cell contents are
	# replaced with the marker in every row.
	ALWAYS_BLACKOUT = {
	    "H100 cost to train",
	    "Audit",
	    "Params total confidence",
	    "Params active confidence",
	    "Tokens confidence",
	}

	# Columns excluded from the per-row, hash-driven redaction pool.
	NEVER_REDACT = {
	    "Model",
	    "Lab",
	    "Announced",
	}

	# Pool from which the hash selects one additional column to redact per row.
	DOUBLE_REDACT = {
	    "License",
	    "Context window",
	    "Country",
	}


	def clean_header(h):
	    """Return header text *h* with whitespace normalised.

	    Leading and trailing whitespace is removed and every internal run of
	    whitespace (spaces, tabs, newlines) collapses to a single space.
	    """
	    # str.split() with no separator already discards leading/trailing
	    # whitespace, so the joined result needs no further trimming.
	    words = h.split()
	    return " ".join(words)


	def match(header, name_set):
	    """Return True if any name in *name_set* occurs inside *header*.

	    The comparison is a case-insensitive substring test, so a name such as
	    "Params (total" matches a header like "Params (total) B".
	    """
	    lowered = header.lower()
	    for candidate in name_set:
	        if candidate.lower() in lowered:
	            return True
	    return False


	def resolve(headers, name_set):
	    """Return the set of column indices whose header matches *name_set*.

	    A header matches when match() reports that it contains any of the names.
	    """
	    hits = set()
	    for position, header in enumerate(headers):
	        if match(header, name_set):
	            hits.add(position)
	    return hits


	def hash_redact(model_name, redactable, double_list):
	    """Pick the columns to redact for one row, deterministically by name.

	    The SHA-256 digest of *model_name* selects one entry from *redactable*
	    (using the first 8 hex digits) and one entry from *double_list* (using
	    the next 8 hex digits), so the same model name always yields the same
	    choices for the same candidate lists.

	    Args:
	        model_name: Text whose hash drives the selection.
	        redactable: Sequence of candidate column indices for the first pick.
	        double_list: Sequence of candidate column indices for the second pick.

	    Returns:
	        A list with the pick from *redactable* followed by the pick from
	        *double_list*. A pool that is empty contributes no pick, so the
	        list has fewer than two items when a pool is empty (and is empty
	        when both are) instead of raising ZeroDivisionError.
	    """
	    digest = hashlib.sha256(model_name.encode()).hexdigest()
	    picks = []
	    # Separate slices of the digest keep the two choices independent.
	    for pool, hex_chunk in ((redactable, digest[:8]), (double_list, digest[8:16])):
	        if pool:  # an empty pool would make the modulo divide by zero
	            picks.append(pool[int(hex_chunk, 16) % len(pool)])
	    return picks


	def main():
	    """Redact a Models Table CSV and write the free-version copy.

	    Command line: ``<script> <input.csv> [output.csv]``. When no output path
	    is given the result is written next to the input as
	    ``models-table-free.csv``.

	    The first CSV row is skipped and the second row is taken as the header
	    row; trailing empty header cells are dropped. Columns whose header
	    matches nothing in VISIBLE, or matches ALWAYS_BLACKOUT, are replaced
	    with REDACT_MARKER in every row. Each row additionally has the columns
	    chosen by hash_redact() for its model name replaced. Rows whose first
	    cell is empty are dropped. A per-column redaction count is printed at
	    the end.

	    Exits with status 1 when no input path is supplied or when the input
	    has no header row.
	    """
	    if len(sys.argv) < 2:
	        print(f"Usage: {sys.argv[0]} <input.csv> [output.csv]")
	        sys.exit(1)

	    infile = Path(sys.argv[1])
	    outfile = Path(sys.argv[2]) if len(sys.argv) > 2 else infile.with_name("models-table-free.csv")

	    with open(infile, newline="", encoding="utf-8") as f:
	        reader = csv.reader(f)
	        # Row 1 is skipped; row 2 holds the column headers. Defaults avoid a
	        # bare StopIteration traceback on a too-short file.
	        next(reader, None)
	        raw_headers = next(reader, None)
	        data_rows = list(reader)

	    if raw_headers is None:
	        print(f"ERROR: {infile} has no header row (expected as the second CSV row)")
	        sys.exit(1)

	    headers = [clean_header(h) for h in raw_headers]
	    # Trailing empty header cells are dropped so they are not emitted.
	    while headers and not headers[-1]:
	        headers.pop()
	    num_cols = len(headers)

	    visible_idx = resolve(headers, VISIBLE)
	    # Everything that is not visible, plus the always-hidden visible columns.
	    blackout_idx = (set(range(num_cols)) - visible_idx) | resolve(headers, ALWAYS_BLACKOUT)
	    never_idx = resolve(headers, NEVER_REDACT)
	    double_list = sorted(resolve(headers, DOUBLE_REDACT))
	    # Pool for the first hash pick: visible columns that are neither
	    # protected, already blacked out, nor reserved for the second pick.
	    redactable = sorted(visible_idx - never_idx - blackout_idx - set(double_list))

	    # Sorted so the warning reads the same on every run.
	    unmatched = sorted(n for n in VISIBLE if not any(n.lower() in h.lower() for h in headers))
	    if unmatched:
	        print(f"WARNING: not found in CSV: {unmatched}")

	    out_rows = []
	    for row in data_rows:
	        model_name = row[0].strip() if row else ""
	        if not model_name:
	            continue

	        # Truncate or pad the row to exactly num_cols cells.
	        full = list(row[:num_cols])
	        full.extend("" for _ in range(num_cols - len(full)))

	        for c in blackout_idx:
	            full[c] = REDACT_MARKER
	        for c in hash_redact(model_name, redactable, double_list):
	            full[c] = REDACT_MARKER

	        out_rows.append(full)

	    with open(outfile, "w", newline="", encoding="utf-8") as f:
	        writer = csv.writer(f)
	        writer.writerow(headers)
	        writer.writerows(out_rows)

	    print(f"Wrote {len(out_rows)} rows, {num_cols} columns to {outfile}")

	    from collections import Counter
	    dist = Counter(
	        headers[i]
	        for row in out_rows
	        for i, v in enumerate(row)
	        if v == REDACT_MARKER
	    )
	    print("\nRedaction distribution:")
	    for col, count in dist.most_common():
	        print(f" {col}: {count}")


	# Run the redaction only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()