Percy3822 committed on
Commit
1f320b1
·
verified ·
1 Parent(s): f919818

Create make_json_dataset.py

Browse files
Files changed (1) hide show
  1. make_json_dataset.py +280 -0
make_json_dataset.py ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse, os, json, random, gzip, time, math, hashlib, textwrap
2
+ from datetime import datetime
3
+ random.seed(2025)
4
+
5
+ # ------------ Content pools (JSON-focused) ------------
6
+ # Simple base objects we combine/perturb
7
+ BASE_OBJECTS = [
8
+ {"id": 1, "name": "Widget", "price": 9.99, "tags": ["tools", "home"]},
9
+ {"id": 2, "name": "Gadget", "price": 19.5, "tags": ["electronics"]},
10
+ {"id": 3, "name": "Thing", "price": 2.75, "tags": []},
11
+ ]
12
+
13
+ INVALID_SAMPLES = [
14
+ ("Trailing comma", "{ \"a\": 1, }", "Remove trailing comma after the last property."),
15
+ ("Single quotes", "{ 'a': 1 }", "Use double quotes for keys and string values."),
16
+ ("Unquoted key", "{ a: 1 }", "Quote keys: { \"a\": 1 }."),
17
+ ("NaN value", "{ \"x\": NaN }", "JSON has no NaN; use null or a string."),
18
+ ("Comments", "{ /* note */ \"a\": 1 }", "JSON does not allow comments."),
19
+ ]
20
+
21
+ SCHEMAS = [
22
+ ("product",
23
+ {
24
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
25
+ "title": "Product",
26
+ "type": "object",
27
+ "required": ["id", "name", "price", "tags"],
28
+ "properties": {
29
+ "id": {"type": "integer", "minimum": 1},
30
+ "name": {"type": "string", "minLength": 1},
31
+ "price": {"type": "number", "minimum": 0},
32
+ "tags": {"type": "array", "items": {"type": "string"}}
33
+ },
34
+ "additionalProperties": False
35
+ }),
36
+ ("invoice",
37
+ {
38
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
39
+ "title": "Invoice",
40
+ "type": "object",
41
+ "required": ["number", "currency", "items", "total"],
42
+ "properties": {
43
+ "number": {"type": "string"},
44
+ "currency": {"type": "string"},
45
+ "items": {"type": "array", "items": {
46
+ "type": "object",
47
+ "required": ["name", "qty", "price"],
48
+ "properties": {
49
+ "name": {"type": "string"},
50
+ "qty": {"type": "integer", "minimum": 1},
51
+ "price": {"type": "number", "minimum": 0}
52
+ },
53
+ "additionalProperties": False
54
+ }},
55
+ "total": {"type": "number", "minimum": 0}
56
+ },
57
+ "additionalProperties": False
58
+ }),
59
+ ]
60
+
61
+ TRANSFORM_TASKS = [
62
+ ("Pick fields", {"keep": ["id", "name"]}),
63
+ ("Add field", {"add": {"in_stock": True}}),
64
+ ("Rename field", {"rename": {"price": "unit_price"}}),
65
+ ("Compute field", {"compute": {"value": "price * 1.2"}}),
66
+ ]
67
+
68
+ API_TASKS = [
69
+ ("Create payload", "Create JSON payload for POST /orders with fields: id (string), customer {id,name}, items [{sku,qty,price}], totals {subtotal,tax,total}."),
70
+ ("Validate payload", "Given schema below, validate a sample order and fix errors."),
71
+ ]
72
+
73
+ EXPLAINERS = [
74
+ "Explain why this JSON is invalid and provide a fixed version.",
75
+ "Explain differences between JSON and JSON5.",
76
+ "Explain when to use JSON Schema and provide a minimal example.",
77
+ ]
78
+
79
+ # ------------ Prompt styles ------------
80
+ WRITE_PREFIX = [
81
+ "Generate valid JSON for the following entity. Use only double quotes, no trailing commas:\nEntity: ",
82
+ "Produce compact JSON (no comments) for: ",
83
+ ]
84
+ FIX_PREFIX = [
85
+ "Fix the invalid JSON. Provide corrected JSON only:\n",
86
+ "The following is invalid. Output a valid JSON equivalent:\n",
87
+ ]
88
+ EXPLAIN_PREFIX = [
89
+ "Explain the error and give a valid JSON version:\n",
90
+ ]
91
+ SCHEMA_PREFIX = [
92
+ "Write a JSON Schema (2020-12) for the entity:\n",
93
+ ]
94
+ SCHEMA_VALIDATE_PREFIX = [
95
+ "Given this schema, provide a valid sample JSON instance:\n",
96
+ ]
97
+ TRANSFORM_PREFIX = [
98
+ "Transform the input JSON per the rules. Output the new JSON only.\nRules:\n",
99
+ ]
100
+ API_PREFIX = [
101
+ "Design the JSON payload for this API:\n",
102
+ "Validate this JSON against the schema and correct it.\n",
103
+ ]
104
+
105
def to_json(obj):
    """Serialize *obj* to a compact JSON string with a deterministic key order.

    Keys are sorted, no whitespace is emitted after ``,`` or ``:``, and
    non-ASCII characters are written verbatim instead of being escaped.
    """
    compact_separators = (",", ":")
    return json.dumps(
        obj,
        sort_keys=True,
        separators=compact_separators,
        ensure_ascii=False,
    )
107
+
108
+ # ------------ Sample builders ------------
109
def make_write_sample():
    """Build a "write JSON" sample.

    The prompt is a random WRITE_PREFIX followed by a random entity name;
    the completion is a random BASE_OBJECTS entry rendered with to_json().
    """
    # Draw order (object, entity, prefix) is kept so the seeded stream is unchanged.
    payload = random.choice(BASE_OBJECTS)
    entity_name = random.choice(["product", "catalog_item", "inventory_record"])
    instruction = random.choice(WRITE_PREFIX)
    return {
        "prompt": instruction + entity_name,
        "completion": to_json(payload),
    }
115
+
116
def _repair_invalid_json(bad):
    """Parse a naively repaired copy of *bad* and return the resulting object.

    The repair is a fixed chain of textual substitutions (single quotes,
    a trailing ``, }``, ``NaN`` and the ``/* note */`` comment). If the
    repaired text still is not strict JSON, the placeholder ``{"a": 1}`` is
    returned so callers always get something serializable.
    """
    def _reject_constant(name):
        # json.loads accepts NaN/Infinity by default; strict JSON does not.
        raise ValueError(f"non-standard JSON constant: {name}")

    repaired = bad
    for old, new in (("'", '"'), (", }", " }"), ("NaN", "null"), ("/* note */", "")):
        repaired = repaired.replace(old, new)
    try:
        return json.loads(repaired, parse_constant=_reject_constant)
    except ValueError:  # includes json.JSONDecodeError
        return {"a": 1}


def make_fix_sample():
    """Build a "fix the invalid JSON" sample.

    The prompt is a random FIX_PREFIX followed by an invalid snippet from
    INVALID_SAMPLES; the completion is the repaired JSON only.
    """
    _title, bad, _tip = random.choice(INVALID_SAMPLES)
    prompt = random.choice(FIX_PREFIX) + bad
    completion = to_json(_repair_invalid_json(bad))
    return {"prompt": prompt, "completion": completion}


def make_explain_sample():
    """Build an "explain and fix" sample.

    The prompt is a random EXPLAIN_PREFIX followed by an invalid snippet from
    INVALID_SAMPLES; the completion is the snippet's tip text, then the
    repaired JSON under a ``Fixed:`` heading.
    """
    _title, bad, tip = random.choice(INVALID_SAMPLES)
    prompt = random.choice(EXPLAIN_PREFIX) + bad
    completion = f"{tip}\nFixed:\n{to_json(_repair_invalid_json(bad))}"
    return {"prompt": prompt, "completion": completion}
140
+
141
def make_schema_sample():
    """Build a "write a JSON Schema" sample.

    The prompt is a random SCHEMA_PREFIX followed by the schema's name; the
    completion is that schema pretty-printed with a two-space indent.
    """
    entity, definition = random.choice(SCHEMAS)
    instruction = random.choice(SCHEMA_PREFIX)
    rendered = json.dumps(definition, indent=2, ensure_ascii=False)
    return {"prompt": instruction + entity, "completion": rendered}
146
+
147
def make_schema_instance_sample():
    """Build a "give a valid instance for this schema" sample.

    The prompt is a random SCHEMA_VALIDATE_PREFIX followed by the
    pretty-printed schema; the completion is a fixed hand-written instance:
    the product example for the "product" schema, the invoice example for
    anything else.
    """
    schema_name, schema_body = random.choice(SCHEMAS)
    rendered_schema = json.dumps(schema_body, indent=2, ensure_ascii=False)
    prompt = random.choice(SCHEMA_VALIDATE_PREFIX) + rendered_schema

    # Instances are canned, not derived from the schema.
    if schema_name != "product":
        example = {
            "number": "INV-001",
            "currency": "USD",
            "items": [{"name": "Sample", "qty": 1, "price": 1.0}],
            "total": 1.0,
        }
    else:
        example = {"id": 1, "name": "Sample", "price": 1.23, "tags": ["demo"]}

    return {"prompt": prompt, "completion": to_json(example)}
162
+
163
def apply_transform(obj, rules):
    """Return a transformed copy of *obj*; the input mapping is never mutated.

    Supported rule keys, applied in this order:

    * ``keep``    - iterable of keys to retain (missing keys are ignored).
    * ``add``     - mapping of key -> value to set.
    * ``rename``  - mapping of old key -> new key (missing keys are ignored).
    * ``compute`` - mapping of target key -> expression. Only expressions of
      the form ``price * <number>`` are understood; the result is rounded to
      two decimals. Anything else is skipped silently (best effort).

    The copy is shallow: nested values are shared with *obj*.
    """
    if "keep" in rules:
        result = {k: obj[k] for k in rules["keep"] if k in obj}
    else:
        # Copy up front so "add"/"rename"/"compute" cannot touch the caller's dict.
        result = dict(obj)

    if "add" in rules:
        result.update(rules["add"])

    if "rename" in rules:
        for old, new in rules["rename"].items():
            if old in result:
                result[new] = result.pop(old)

    if "compute" in rules and "price" in result:
        try:
            price = float(result["price"])
        except (TypeError, ValueError):
            price = None  # non-numeric price: nothing can be computed
        if price is not None:
            for target, expr in rules["compute"].items():
                operand, star, factor_text = str(expr).partition("*")
                if not star or operand.strip() != "price":
                    continue  # unsupported expression shape
                try:
                    value = price * float(factor_text)
                except ValueError:
                    continue  # right-hand side is not a number
                # NaN/Infinity cannot be represented in strict JSON.
                if math.isfinite(value):
                    result[target] = round(value, 2)

    return result
182
+
183
def make_transform_sample():
    """Build a "transform this JSON" sample.

    The prompt is a random TRANSFORM_PREFIX, the pretty-printed rules, and
    the compact input object; the completion is the compact result of
    apply_transform() on a shallow copy of that object.
    """
    # Draw order (object, task, prefix) is kept so the seeded stream is unchanged.
    source = random.choice(BASE_OBJECTS)
    _title, rules = random.choice(TRANSFORM_TASKS)
    instruction = random.choice(TRANSFORM_PREFIX)

    rendered_rules = json.dumps(rules, ensure_ascii=False, indent=2)
    prompt = "".join([instruction, rendered_rules, "\nInput:\n", to_json(source)])

    transformed = apply_transform(dict(source), rules)
    return {"prompt": prompt, "completion": to_json(transformed)}
190
+
191
def make_api_sample():
    """Build an API-payload sample whose prompt wording matches its task.

    A task is drawn from API_TASKS. "Create payload" tasks get the design
    wording (first API_PREFIX entry) and a canned order payload as the
    completion; every other task gets the validation wording (last
    API_PREFIX entry) and a canned invalid/fixed pair.
    """
    title, instruction = random.choice(API_TASKS)
    wants_payload = "Create payload" in title

    # Choose the prefix from the task rather than at random: a random draw
    # pairs "Design the JSON payload..." with the validate instruction (and
    # the reverse) about half the time.
    prefix = API_PREFIX[0] if wants_payload else API_PREFIX[-1]
    prompt = prefix + instruction

    if wants_payload:
        completion = to_json({
            "id": "ORD-1001",
            "customer": {"id": "C-1", "name": "Alex"},
            "items": [{"sku": "ABC", "qty": 2, "price": 4.5}],
            "totals": {"subtotal": 9.0, "tax": 0.72, "total": 9.72},
        })
    else:
        # Minimal example of "invalid" -> "fixed".
        bad = '{"id":"ORD-1001","items":[{"sku":"ABC","qty":0}]}'
        fixed = {
            "id": "ORD-1001",
            "items": [{"sku": "ABC", "qty": 1, "price": 1.0}],
            "totals": {"subtotal": 1.0, "tax": 0.08, "total": 1.08},
        }
        completion = f"Invalid: {bad}\nFixed: {to_json(fixed)}"

    return {"prompt": prompt, "completion": completion}
207
+
208
+ MAKERS = [
209
+ (make_write_sample, 0.25),
210
+ (make_fix_sample, 0.20),
211
+ (make_explain_sample, 0.10),
212
+ (make_schema_sample, 0.15),
213
+ (make_schema_instance_sample, 0.10),
214
+ (make_transform_sample, 0.10),
215
+ (make_api_sample, 0.10),
216
+ ]
217
+
218
def pick_maker():
    """Return one sample-builder function, chosen by the weights in MAKERS.

    A single uniform draw is compared against the running sum of weights;
    the first entry whose cumulative weight reaches the draw wins. If
    rounding leaves the draw above the final sum, the last entry is used.
    """
    draw = random.random()
    cumulative = 0.0
    for maker, weight in MAKERS:
        cumulative += weight
        if cumulative >= draw:
            return maker
    last_maker, _last_weight = MAKERS[-1]
    return last_maker
225
+
226
+ # ------------ Shard writer ------------
227
def write_shard(path, n_rows):
    """Write *n_rows* generated samples to *path* as gzip-compressed JSON Lines.

    Each row is one JSON object followed by a single ``\\n``. The gzip header
    timestamp is pinned to 0 so that, with the module's fixed random seed,
    the same run produces byte-identical shards and stable digests.
    """
    import io  # local import: not among the file's top-level imports

    with gzip.GzipFile(path, mode="wb", mtime=0) as gz, io.TextIOWrapper(
        gz, encoding="utf-8", newline="\n"  # no platform newline translation
    ) as f:
        for _ in range(n_rows):
            sample = pick_maker()()
            f.write(json.dumps(sample, ensure_ascii=False) + "\n")
232
+
233
def sha256_file(path):
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is read in 1 MiB pieces so large files are never loaded whole.
    """
    digest = hashlib.sha256()
    piece_size = 1 << 20
    with open(path, "rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for piece in iter(lambda: stream.read(piece_size), b""):
            digest.update(piece)
    return digest.hexdigest()
241
+
242
def main():
    """Command-line entry point: generate the sharded dataset and a manifest.

    Options: ``--total`` (number of samples), ``--shard_size`` (samples per
    shard), ``--out_dir`` (output directory, created if missing) and
    ``--prefix`` (file-name prefix). Shards are written as
    ``<prefix>_NNNN.jsonl.gz``; the manifest ``<prefix>_manifest.json`` lists
    each shard's path, row count and SHA-256 digest.
    """
    from datetime import timezone  # local import: the file only imports `datetime`

    ap = argparse.ArgumentParser()
    ap.add_argument("--total", type=int, default=1_000_000)
    ap.add_argument("--shard_size", type=int, default=10_000)
    ap.add_argument("--out_dir", type=str, default="json_dataset_v1")
    ap.add_argument("--prefix", type=str, default="json")
    args = ap.parse_args()

    # Reject sizes that would otherwise divide by zero or give negative row counts.
    if args.total < 0:
        ap.error("--total must be >= 0")
    if args.shard_size < 1:
        ap.error("--shard_size must be >= 1")

    os.makedirs(args.out_dir, exist_ok=True)
    n_shards = -(-args.total // args.shard_size)  # integer ceiling division
    manifest = {
        # Same "...Z" format as before, without the deprecated utcnow().
        "created": datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z",
        "total": args.total,
        "shard_size": args.shard_size,
        "num_shards": n_shards,
        "files": [],
    }

    print(f"Generating {args.total:,} samples in {n_shards} shards of {args.shard_size}...")

    for i in range(n_shards):
        # Every shard is full except possibly the last one.
        rows = min(args.shard_size, args.total - i * args.shard_size)
        shard_path = os.path.join(args.out_dir, f"{args.prefix}_{i:04d}.jsonl.gz")
        t0 = time.perf_counter()  # monotonic clock for elapsed time
        write_shard(shard_path, rows)
        digest = sha256_file(shard_path)
        dt = time.perf_counter() - t0
        manifest["files"].append({"path": shard_path, "rows": rows, "sha256": digest})
        print(f"[{i+1}/{n_shards}] wrote {rows} rows → {os.path.basename(shard_path)} in {dt:.1f}s")

    man_path = os.path.join(args.out_dir, f"{args.prefix}_manifest.json")
    with open(man_path, "w", encoding="utf-8") as mf:
        json.dump(manifest, mf, indent=2)
    print("Done. Manifest:", man_path)
    # print a sample row
    print("Sample:", json.dumps(pick_maker()(), ensure_ascii=False))
278
+
279
+ if __name__ == "__main__":
280
+ main()