# Source: subnet32-llm-detector / scripts/dataset_to_adadetect_format.py
# (uploaded by ThaoTran7, commit 485127c, "incomplete commit")
#!/usr/bin/env python3
# Copyright (c) Jin Zhu.
#
# Convert a labeled dataset (e.g. SN32-style human vs LLM exports) into
# AdaDetectGPT training format: { "original": [...], "sampled": [...] }
# written to <output_prefix>.raw_data.json
#
# Examples:
# thaoluon/subnet32_dataset writes outputs/train.jsonl with keys "label" (human|ai) and "final_text":
# python scripts/dataset_to_adadetect_format.py -i ../subnet32_dataset/outputs/train.jsonl \
# --text_col final_text --label_col label -o ./exp_main/data/subnet32_train
# Generic JSONL:
# python scripts/dataset_to_adadetect_format.py -i data.jsonl --text_col text --label_col label \
# --human_labels human,0,HUMAN --ai_labels ai,1,AI,llm -o ./exp_main/data/mydata
# python scripts/dataset_to_adadetect_format.py -i packed.json --json_lists -o ./exp_main/data/mydata
import argparse
import csv
import json
import os
def _norm_label(x):
return str(x).strip().lower()
def main():
p = argparse.ArgumentParser(description="Build *.raw_data.json for AdaDetectGPT / detect_gpt_ada.py")
p.add_argument("-i", "--input", required=True, help="Path to .jsonl, .csv, or .json")
p.add_argument("-o", "--output_prefix", required=True, help="Output path without suffix (adds .raw_data.json)")
p.add_argument("--text_col", default="text", help="Column / key for passage text")
p.add_argument("--label_col", default="label", help="Column / key for class label")
p.add_argument(
"--human_labels",
default="human,0,false,human_written,h",
help="Comma-separated label values treated as human (case-insensitive match after strip)",
)
p.add_argument(
"--ai_labels",
default="ai,1,true,llm,machine,sampled,gpt,assistant",
help="Comma-separated label values treated as LLM / machine text",
)
p.add_argument(
"--json_lists",
action="store_true",
help="Input JSON is already {\"original\":[...],\"sampled\":[...]} or {\"human\":[...],\"ai\":[...]}",
)
args = p.parse_args()
human_set = {_norm_label(x) for x in args.human_labels.split(",") if x.strip()}
ai_set = {_norm_label(x) for x in args.ai_labels.split(",") if x.strip()}
original, sampled = [], []
path = args.input
if args.json_lists:
with open(path, encoding="utf-8") as f:
data = json.load(f)
if "original" in data and "sampled" in data:
original = list(data["original"])
sampled = list(data["sampled"])
elif "human" in data and ("ai" in data or "sampled" in data):
original = list(data["human"])
sampled = list(data.get("ai", data.get("sampled")))
else:
raise ValueError(
"With --json_lists, JSON must have keys (original, sampled) or (human, ai|sampled)."
)
elif path.endswith(".jsonl"):
with open(path, encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line:
continue
row = json.loads(line)
text = row[args.text_col]
lab = _norm_label(row[args.label_col])
if lab in human_set:
original.append(text)
elif lab in ai_set:
sampled.append(text)
elif path.endswith(".csv"):
with open(path, encoding="utf-8", newline="") as f:
for row in csv.DictReader(f):
text = row[args.text_col]
lab = _norm_label(row[args.label_col])
if lab in human_set:
original.append(text)
elif lab in ai_set:
sampled.append(text)
else:
raise SystemExit("Unsupported input: use .jsonl, .csv, or .json with --json_lists")
out_path = f"{args.output_prefix}.raw_data.json"
out_dir = os.path.dirname(out_path)
if out_dir:
os.makedirs(out_dir, exist_ok=True)
payload = {"original": original, "sampled": sampled}
with open(out_path, "w", encoding="utf-8") as fout:
json.dump(payload, fout, ensure_ascii=False, indent=2)
print(f"Wrote {out_path} (human={len(original)}, llm={len(sampled)})")
if __name__ == "__main__":
main()