|
import argparse
import csv
import json
import os
|
|
|
|
| def _norm_label(x): |
| return str(x).strip().lower() |
|
|
|
|
| def main(): |
| p = argparse.ArgumentParser(description="Build *.raw_data.json for AdaDetectGPT / detect_gpt_ada.py") |
| p.add_argument("-i", "--input", required=True, help="Path to .jsonl, .csv, or .json") |
| p.add_argument("-o", "--output_prefix", required=True, help="Output path without suffix (adds .raw_data.json)") |
| p.add_argument("--text_col", default="text", help="Column / key for passage text") |
| p.add_argument("--label_col", default="label", help="Column / key for class label") |
| p.add_argument( |
| "--human_labels", |
| default="human,0,false,human_written,h", |
| help="Comma-separated label values treated as human (case-insensitive match after strip)", |
| ) |
| p.add_argument( |
| "--ai_labels", |
| default="ai,1,true,llm,machine,sampled,gpt,assistant", |
| help="Comma-separated label values treated as LLM / machine text", |
| ) |
| p.add_argument( |
| "--json_lists", |
| action="store_true", |
| help="Input JSON is already {\"original\":[...],\"sampled\":[...]} or {\"human\":[...],\"ai\":[...]}", |
| ) |
| args = p.parse_args() |
|
|
| human_set = {_norm_label(x) for x in args.human_labels.split(",") if x.strip()} |
| ai_set = {_norm_label(x) for x in args.ai_labels.split(",") if x.strip()} |
|
|
| original, sampled = [], [] |
| path = args.input |
|
|
| if args.json_lists: |
| with open(path, encoding="utf-8") as f: |
| data = json.load(f) |
| if "original" in data and "sampled" in data: |
| original = list(data["original"]) |
| sampled = list(data["sampled"]) |
| elif "human" in data and ("ai" in data or "sampled" in data): |
| original = list(data["human"]) |
| sampled = list(data.get("ai", data.get("sampled"))) |
| else: |
| raise ValueError( |
| "With --json_lists, JSON must have keys (original, sampled) or (human, ai|sampled)." |
| ) |
| elif path.endswith(".jsonl"): |
| with open(path, encoding="utf-8") as f: |
| for line in f: |
| line = line.strip() |
| if not line: |
| continue |
| row = json.loads(line) |
| text = row[args.text_col] |
| lab = _norm_label(row[args.label_col]) |
| if lab in human_set: |
| original.append(text) |
| elif lab in ai_set: |
| sampled.append(text) |
| elif path.endswith(".csv"): |
| with open(path, encoding="utf-8", newline="") as f: |
| for row in csv.DictReader(f): |
| text = row[args.text_col] |
| lab = _norm_label(row[args.label_col]) |
| if lab in human_set: |
| original.append(text) |
| elif lab in ai_set: |
| sampled.append(text) |
| else: |
| raise SystemExit("Unsupported input: use .jsonl, .csv, or .json with --json_lists") |
|
|
| out_path = f"{args.output_prefix}.raw_data.json" |
| out_dir = os.path.dirname(out_path) |
| if out_dir: |
| os.makedirs(out_dir, exist_ok=True) |
| payload = {"original": original, "sampled": sampled} |
| with open(out_path, "w", encoding="utf-8") as fout: |
| json.dump(payload, fout, ensure_ascii=False, indent=2) |
| print(f"Wrote {out_path} (human={len(original)}, llm={len(sampled)})") |
|
|
|
|
# Run the converter only when executed as a script (not on import).
if __name__ == "__main__":
    main()
|
|