|
|
import argparse, os, json, random, gzip, time, math, hashlib, textwrap |
|
|
from datetime import datetime |
|
|
# Seed the shared module-level RNG with a fixed value at import time so the
# sequence of random choices made by the generators is reproducible.
random.seed(2025)
|
|
|
|
|
|
|
|
|
|
|
# Small catalogue of example entities.  make_write_sample emits one of these
# as a completion and make_transform_sample uses one as the transform input.
BASE_OBJECTS = [
    {
        "id": 1,
        "name": "Widget",
        "price": 9.99,
        "tags": ["tools", "home"],
    },
    {
        "id": 2,
        "name": "Gadget",
        "price": 19.5,
        "tags": ["electronics"],
    },
    {
        "id": 3,
        "name": "Thing",
        "price": 2.75,
        "tags": [],
    },
]
|
|
|
|
|
# Deliberately malformed JSON snippets as (title, invalid_text, tip) triples.
# The invalid text goes into prompts; the tip is used by make_explain_sample.
INVALID_SAMPLES = [
    (
        "Trailing comma",
        '{ "a": 1, }',
        "Remove trailing comma after the last property.",
    ),
    (
        "Single quotes",
        "{ 'a': 1 }",
        "Use double quotes for keys and string values.",
    ),
    (
        "Unquoted key",
        "{ a: 1 }",
        'Quote keys: { "a": 1 }.',
    ),
    (
        "NaN value",
        '{ "x": NaN }',
        "JSON has no NaN; use null or a string.",
    ),
    (
        "Comments",
        '{ /* note */ "a": 1 }',
        "JSON does not allow comments.",
    ),
]
|
|
|
|
|
# JSON Schema dialect shared by every schema below.
_SCHEMA_DIALECT = "https://json-schema.org/draft/2020-12/schema"

# Key order inside each schema is significant: make_schema_sample dumps the
# dicts without sort_keys, so insertion order is what ends up in the output.
_PRODUCT_SCHEMA = {
    "$schema": _SCHEMA_DIALECT,
    "title": "Product",
    "type": "object",
    "required": ["id", "name", "price", "tags"],
    "properties": {
        "id": {"type": "integer", "minimum": 1},
        "name": {"type": "string", "minLength": 1},
        "price": {"type": "number", "minimum": 0},
        "tags": {"type": "array", "items": {"type": "string"}},
    },
    "additionalProperties": False,
}

# Shape of a single line item inside an invoice.
_INVOICE_LINE_SCHEMA = {
    "type": "object",
    "required": ["name", "qty", "price"],
    "properties": {
        "name": {"type": "string"},
        "qty": {"type": "integer", "minimum": 1},
        "price": {"type": "number", "minimum": 0},
    },
    "additionalProperties": False,
}

_INVOICE_SCHEMA = {
    "$schema": _SCHEMA_DIALECT,
    "title": "Invoice",
    "type": "object",
    "required": ["number", "currency", "items", "total"],
    "properties": {
        "number": {"type": "string"},
        "currency": {"type": "string"},
        "items": {"type": "array", "items": _INVOICE_LINE_SCHEMA},
        "total": {"type": "number", "minimum": 0},
    },
    "additionalProperties": False,
}

# (entity name, schema) pairs consumed by the schema sample generators.
SCHEMAS = [
    ("product", _PRODUCT_SCHEMA),
    ("invoice", _INVOICE_SCHEMA),
]
|
|
|
|
|
# (title, rules) pairs for transform samples.  Each rules dict uses one of the
# keys handled by apply_transform: "keep", "add", "rename" or "compute".
TRANSFORM_TASKS = [
    (
        "Pick fields",
        {"keep": ["id", "name"]},
    ),
    (
        "Add field",
        {"add": {"in_stock": True}},
    ),
    (
        "Rename field",
        {"rename": {"price": "unit_price"}},
    ),
    (
        "Compute field",
        {"compute": {"value": "price * 1.2"}},
    ),
]
|
|
|
|
|
# (title, instruction) pairs for API samples.  make_api_sample branches on
# whether the title contains "Create payload".
API_TASKS = [
    (
        "Create payload",
        "Create JSON payload for POST /orders with fields: id (string), "
        "customer {id,name}, items [{sku,qty,price}], "
        "totals {subtotal,tax,total}.",
    ),
    (
        "Validate payload",
        "Given schema below, validate a sample order and fix errors.",
    ),
]
|
|
|
|
|
# Stand-alone explanation-style instructions.
EXPLAINERS = [
    "Explain why this JSON is invalid and provide a fixed version.",
    "Explain differences between JSON and JSON5.",
    "Explain when to use JSON Schema and provide a minimal example.",
]
|
|
|
|
|
|
|
|
# Prompt lead-ins, one list per sample kind.  Each generator picks one entry
# at random and appends its own payload (entity name, invalid JSON, schema,
# rules or instruction) directly after it.

# make_write_sample: followed by the entity name.
WRITE_PREFIX = [
    "Generate valid JSON for the following entity. Use only double quotes, no trailing commas:\nEntity: ",
    "Produce compact JSON (no comments) for: ",
]

# make_fix_sample: followed by the invalid JSON text.
FIX_PREFIX = [
    "Fix the invalid JSON. Provide corrected JSON only:\n",
    "The following is invalid. Output a valid JSON equivalent:\n",
]

# make_explain_sample: followed by the invalid JSON text.
EXPLAIN_PREFIX = [
    "Explain the error and give a valid JSON version:\n",
]

# make_schema_sample: followed by the entity name.
SCHEMA_PREFIX = [
    "Write a JSON Schema (2020-12) for the entity:\n",
]

# make_schema_instance_sample: followed by the pretty-printed schema.
SCHEMA_VALIDATE_PREFIX = [
    "Given this schema, provide a valid sample JSON instance:\n",
]

# make_transform_sample: followed by the rules and the input document.
TRANSFORM_PREFIX = [
    "Transform the input JSON per the rules. Output the new JSON only.\nRules:\n",
]

# make_api_sample: followed by the task instruction.
API_PREFIX = [
    "Design the JSON payload for this API:\n",
    "Validate this JSON against the schema and correct it.\n",
]
|
|
|
|
|
def to_json(obj):
    """Serialize *obj* as compact JSON text.

    Keys are sorted, no whitespace is emitted around separators, and
    non-ASCII characters are written as-is rather than escaped.
    """
    compact_separators = (",", ":")
    return json.dumps(
        obj,
        sort_keys=True,
        separators=compact_separators,
        ensure_ascii=False,
    )
|
|
|
|
|
|
|
|
def make_write_sample():
    """Build a "write JSON for this entity" sample.

    Returns:
        A dict with a ``prompt`` (random write prefix plus a random entity
        name) and a ``completion`` (compact JSON of a random base object).
    """
    # The order of the random draws is kept stable so seeded runs reproduce.
    record = random.choice(BASE_OBJECTS)
    entity_name = random.choice(["product", "catalog_item", "inventory_record"])
    lead_in = random.choice(WRITE_PREFIX)
    return {
        "prompt": lead_in + entity_name,
        "completion": to_json(record),
    }
|
|
|
|
|
def _repair_invalid_json(bad):
    """Parse a best-effort textual repair of the invalid JSON text *bad*.

    The repair is a fixed chain of substitutions: single quotes become double
    quotes, a trailing ``", }"`` loses its comma, ``NaN`` becomes ``null`` and
    the literal ``/* note */`` comment is dropped.

    Returns:
        The parsed object, or ``{"a": 1}`` when the patched text still is not
        valid JSON (the substitutions do not quote bare keys).
    """
    patched = (
        bad.replace("'", '"')
        .replace(", }", " }")
        .replace("NaN", "null")
        .replace("/* note */", "")
    )
    try:
        return json.loads(patched)
    except json.JSONDecodeError:
        # Deliberate fallback: unrepaired input maps to the canonical sample.
        return {"a": 1}


def make_fix_sample():
    """Build a "fix this invalid JSON" sample.

    Returns:
        A dict with a ``prompt`` (random fix prefix plus the invalid text)
        and a ``completion`` holding only the repaired compact JSON.
    """
    _title, bad, _tip = random.choice(INVALID_SAMPLES)
    prompt = random.choice(FIX_PREFIX) + bad
    return {"prompt": prompt, "completion": to_json(_repair_invalid_json(bad))}


def make_explain_sample():
    """Build an "explain the error and fix it" sample.

    Returns:
        A dict with a ``prompt`` (random explain prefix plus the invalid
        text) and a ``completion`` made of the sample's tip followed by the
        repaired compact JSON.
    """
    _title, bad, tip = random.choice(INVALID_SAMPLES)
    prompt = random.choice(EXPLAIN_PREFIX) + bad
    repaired = to_json(_repair_invalid_json(bad))
    return {"prompt": prompt, "completion": f"{tip}\nFixed:\n{repaired}"}
|
|
|
|
|
def make_schema_sample():
    """Build a "write a JSON Schema for this entity" sample.

    Returns:
        A dict with a ``prompt`` (random schema prefix plus the entity name)
        and a ``completion`` holding the schema pretty-printed with a
        two-space indent.
    """
    entity_name, schema = random.choice(SCHEMAS)
    lead_in = random.choice(SCHEMA_PREFIX)
    pretty_schema = json.dumps(schema, ensure_ascii=False, indent=2)
    return {"prompt": lead_in + entity_name, "completion": pretty_schema}
|
|
|
|
|
def make_schema_instance_sample():
    """Build a "give a valid instance for this schema" sample.

    Returns:
        A dict with a ``prompt`` (random prefix plus the pretty-printed
        schema) and a ``completion`` holding a fixed example instance as
        compact JSON.
    """
    schema_name, schema = random.choice(SCHEMAS)
    lead_in = random.choice(SCHEMA_VALIDATE_PREFIX)
    prompt = lead_in + json.dumps(schema, ensure_ascii=False, indent=2)

    # Any schema other than "product" gets the invoice example.
    if schema_name != "product":
        example = {
            "number": "INV-001",
            "currency": "USD",
            "items": [{"name": "Sample", "qty": 1, "price": 1.0}],
            "total": 1.0,
        }
    else:
        example = {"id": 1, "name": "Sample", "price": 1.23, "tags": ["demo"]}

    return {"prompt": prompt, "completion": to_json(example)}
|
|
|
|
|
def apply_transform(obj, rules):
    """Return a transformed copy of *obj* according to *rules*.

    Supported rule keys, applied in this order:

    * ``keep``: list of keys to retain; keys absent from *obj* are ignored.
    * ``add``: mapping of keys to set (overwriting existing values).
    * ``rename``: mapping of old key to new key; missing old keys are skipped.
    * ``compute``: only ``{"value": "... * 1.2"}`` is recognised; it stores
      ``round(price * 1.2, 2)`` under ``"value"`` when *obj* has a ``price``.

    Args:
        obj: Source mapping.  It is never modified; the copy is shallow, so
            nested values are shared with the result.
        rules: Mapping of rule name to rule arguments.

    Returns:
        A new dict with the rules applied.
    """
    if "keep" in rules:
        result = {key: obj[key] for key in rules["keep"] if key in obj}
    else:
        # Copy so the add/rename/compute steps never touch the caller's dict.
        result = dict(obj)

    if "add" in rules:
        result.update(rules["add"])

    if "rename" in rules:
        for old, new in rules["rename"].items():
            if old in result:
                result[new] = result.pop(old)

    if "compute" in rules:
        expr = rules["compute"].get("value", "")
        if "price" in result and "* 1.2" in expr:
            try:
                result["value"] = round(float(result["price"]) * 1.2, 2)
            except (TypeError, ValueError, OverflowError):
                # A non-numeric price leaves the record without "value".
                pass

    return result
|
|
|
|
|
def make_transform_sample():
    """Build a "transform this JSON per the rules" sample.

    Returns:
        A dict with a ``prompt`` (random prefix, the pretty-printed rules,
        then the compact input document) and a ``completion`` holding the
        transformed document as compact JSON.
    """
    source = random.choice(BASE_OBJECTS)
    _title, rules = random.choice(TRANSFORM_TASKS)
    lead_in = random.choice(TRANSFORM_PREFIX)

    rules_text = json.dumps(rules, ensure_ascii=False, indent=2)
    prompt = f"{lead_in}{rules_text}\nInput:\n{to_json(source)}"

    # Transform a copy so the shared base object stays untouched.
    transformed = apply_transform(dict(source), rules)
    return {"prompt": prompt, "completion": to_json(transformed)}
|
|
|
|
|
def make_api_sample():
    """Build an API payload sample (either design or validate-and-fix).

    Returns:
        A dict with a ``prompt`` (random API prefix plus the task
        instruction) and a ``completion``: a fixed order payload for
        "Create payload" tasks, otherwise a fixed invalid/fixed pair.
    """
    title, instruction = random.choice(API_TASKS)
    prompt = random.choice(API_PREFIX) + instruction

    if "Create payload" not in title:
        # Validation task: show the broken order next to its corrected form.
        bad = '{"id":"ORD-1001","items":[{"sku":"ABC","qty":0}]}'
        fixed = {
            "id": "ORD-1001",
            "items": [{"sku": "ABC", "qty": 1, "price": 1.0}],
            "totals": {"subtotal": 1.0, "tax": 0.08, "total": 1.08},
        }
        return {
            "prompt": prompt,
            "completion": f"Invalid: {bad}\nFixed: {to_json(fixed)}",
        }

    order = {
        "id": "ORD-1001",
        "customer": {"id": "C-1", "name": "Alex"},
        "items": [{"sku": "ABC", "qty": 2, "price": 4.5}],
        "totals": {"subtotal": 9.0, "tax": 0.72, "total": 9.72},
    }
    return {"prompt": prompt, "completion": to_json(order)}
|
|
|
|
|
# Weighted registry of sample generators as (function, weight) pairs.
# pick_maker walks this list in order, accumulating the weights, so both the
# order and the values determine which generator a given random draw selects.
MAKERS = [
    (make_write_sample, 0.25),
    (make_fix_sample, 0.20),
    (make_explain_sample, 0.10),
    (make_schema_sample, 0.15),
    (make_schema_instance_sample, 0.10),
    (make_transform_sample, 0.10),
    (make_api_sample, 0.10),
]
|
|
|
|
|
def pick_maker():
    """Pick one generator function from MAKERS according to its weight.

    A single uniform draw is compared against the running total of the
    weights; the first entry whose cumulative weight reaches the draw wins.

    Returns:
        The selected generator function (the last entry if the draw exceeds
        the accumulated total).
    """
    draw = random.random()
    cumulative = 0.0
    for maker, weight in MAKERS:
        cumulative += weight
        if draw <= cumulative:
            return maker
    # Only reachable when the weights accumulate to less than the draw.
    last_maker, _ = MAKERS[-1]
    return last_maker
|
|
|
|
|
|
|
|
def write_shard(path, n_rows):
    """Write *n_rows* freshly generated samples to a gzip JSONL file.

    Args:
        path: Destination file; opened in gzip text mode with UTF-8 encoding.
        n_rows: Number of samples (one JSON object per line) to write.
    """
    with gzip.open(path, "wt", encoding="utf-8") as shard:
        for _ in range(n_rows):
            make_sample = pick_maker()
            record = make_sample()
            line = json.dumps(record, ensure_ascii=False) + "\n"
            shard.write(line)
|
|
|
|
|
def sha256_file(path):
    """Return the hex SHA-256 digest of the file at *path*.

    The file is read in 1 MiB chunks so it is never loaded whole.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for block in iter(lambda: stream.read(1 << 20), b""):
            digest.update(block)
    return digest.hexdigest()
|
|
|
|
|
def main():
    """Generate the sharded dataset and its manifest from command-line options.

    Options:
        --total: number of samples to generate (default 1,000,000).
        --shard_size: maximum rows per shard (default 10,000).
        --out_dir: output directory, created if missing.
        --prefix: file-name prefix for shards and manifest.

    Each shard is written as ``<prefix>_<index>.jsonl.gz`` inside the output
    directory and listed in ``<prefix>_manifest.json`` with its row count and
    SHA-256 digest.  A single extra sample is printed at the end.

    Exits through argparse's error handling (usage message, status 2) when
    ``--shard_size`` is not positive or ``--total`` is negative.
    """
    # Local import: the module header only imports ``datetime`` itself.
    from datetime import timezone

    ap = argparse.ArgumentParser()
    ap.add_argument("--total", type=int, default=1_000_000)
    ap.add_argument("--shard_size", type=int, default=10_000)
    ap.add_argument("--out_dir", type=str, default="json_dataset_v1")
    ap.add_argument("--prefix", type=str, default="json")
    args = ap.parse_args()

    # Reject sizes that would divide by zero or describe a nonsensical dataset.
    if args.shard_size <= 0:
        ap.error("--shard_size must be a positive integer")
    if args.total < 0:
        ap.error("--total must be a non-negative integer")

    os.makedirs(args.out_dir, exist_ok=True)
    n_shards = math.ceil(args.total / args.shard_size)

    # Aware UTC clock, rendered in the same naive "...Z" form as before.
    created = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
    manifest = {
        "created": created,
        "total": args.total,
        "shard_size": args.shard_size,
        "num_shards": n_shards,
        "files": [],
    }

    print(f"Generating {args.total:,} samples in {n_shards} shards of {args.shard_size}...")

    for i in range(n_shards):
        # Every shard is full except possibly the last, which takes the rest.
        rows = min(args.shard_size, args.total - i * args.shard_size)
        shard_path = os.path.join(args.out_dir, f"{args.prefix}_{i:04d}.jsonl.gz")
        t0 = time.time()
        write_shard(shard_path, rows)
        digest = sha256_file(shard_path)
        dt = time.time() - t0
        manifest["files"].append({"path": shard_path, "rows": rows, "sha256": digest})
        print(f"[{i+1}/{n_shards}] wrote {rows} rows → {os.path.basename(shard_path)} in {dt:.1f}s")

    man_path = os.path.join(args.out_dir, f"{args.prefix}_manifest.json")
    with open(man_path, "w", encoding="utf-8") as mf:
        json.dump(manifest, mf, indent=2)
    print("Done. Manifest:", man_path)

    print("Sample:", json.dumps(pick_maker()(), ensure_ascii=False))
|
|
|
|
|
# Script entry point: run the generator only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()