# Source: subnet32-llm-detector / scripts/dataset_to_adadetect_format.py
# (uploaded by ThaoTran7, commit 485127c, "incomplete commit")
#!/usr/bin/env python3
# Copyright (c) Jin Zhu.
#
# Convert a labeled dataset (e.g. SN32-style human vs LLM exports) into
# AdaDetectGPT training format: { "original": [...], "sampled": [...] }
# written to <output_prefix>.raw_data.json
#
# Examples:
# thaoluon/subnet32_dataset writes outputs/train.jsonl with keys "label" (human|ai) and "final_text":
# python scripts/dataset_to_adadetect_format.py -i ../subnet32_dataset/outputs/train.jsonl \
# --text_col final_text --label_col label -o ./exp_main/data/subnet32_train
# Generic JSONL:
# python scripts/dataset_to_adadetect_format.py -i data.jsonl --text_col text --label_col label \
# --human_labels human,0,HUMAN --ai_labels ai,1,AI,llm -o ./exp_main/data/mydata
# python scripts/dataset_to_adadetect_format.py -i packed.json --json_lists -o ./exp_main/data/mydata
import argparse
import csv
import json
import os
def _norm_label(x):
return str(x).strip().lower()
def main():
p = argparse.ArgumentParser(description="Build *.raw_data.json for AdaDetectGPT / detect_gpt_ada.py")
p.add_argument("-i", "--input", required=True, help="Path to .jsonl, .csv, or .json")
p.add_argument("-o", "--output_prefix", required=True, help="Output path without suffix (adds .raw_data.json)")
p.add_argument("--text_col", default="text", help="Column / key for passage text")
p.add_argument("--label_col", default="label", help="Column / key for class label")
p.add_argument(
"--human_labels",
default="human,0,false,human_written,h",
help="Comma-separated label values treated as human (case-insensitive match after strip)",
)
p.add_argument(
"--ai_labels",
default="ai,1,true,llm,machine,sampled,gpt,assistant",
help="Comma-separated label values treated as LLM / machine text",
)
p.add_argument(
"--json_lists",
action="store_true",
help="Input JSON is already {\"original\":[...],\"sampled\":[...]} or {\"human\":[...],\"ai\":[...]}",
)
args = p.parse_args()
human_set = {_norm_label(x) for x in args.human_labels.split(",") if x.strip()}
ai_set = {_norm_label(x) for x in args.ai_labels.split(",") if x.strip()}
original, sampled = [], []
path = args.input
if args.json_lists:
with open(path, encoding="utf-8") as f:
data = json.load(f)
if "original" in data and "sampled" in data:
original = list(data["original"])
sampled = list(data["sampled"])
elif "human" in data and ("ai" in data or "sampled" in data):
original = list(data["human"])
sampled = list(data.get("ai", data.get("sampled")))
else:
raise ValueError(
"With --json_lists, JSON must have keys (original, sampled) or (human, ai|sampled)."
)
elif path.endswith(".jsonl"):
with open(path, encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line:
continue
row = json.loads(line)
text = row[args.text_col]
lab = _norm_label(row[args.label_col])
if lab in human_set:
original.append(text)
elif lab in ai_set:
sampled.append(text)
elif path.endswith(".csv"):
with open(path, encoding="utf-8", newline="") as f:
for row in csv.DictReader(f):
text = row[args.text_col]
lab = _norm_label(row[args.label_col])
if lab in human_set:
original.append(text)
elif lab in ai_set:
sampled.append(text)
else:
raise SystemExit("Unsupported input: use .jsonl, .csv, or .json with --json_lists")
out_path = f"{args.output_prefix}.raw_data.json"
out_dir = os.path.dirname(out_path)
if out_dir:
os.makedirs(out_dir, exist_ok=True)
payload = {"original": original, "sampled": sampled}
with open(out_path, "w", encoding="utf-8") as fout:
json.dump(payload, fout, ensure_ascii=False, indent=2)
print(f"Wrote {out_path} (human={len(original)}, llm={len(sampled)})")
if __name__ == "__main__":
main()