LaelaZ committed on
Commit
35ff0c7
·
verified ·
1 Parent(s): 3b33545

Sync package to GitHub source: em-dashes out of rendered output; no API/logic change

Browse files
synthkit/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """synthkit: generate synthetic data and grade it for quality.
2
+
3
+ Three products on one core:
4
+ • text (A): instruction / eval datasets for training & evaluating LLMs [live]
5
+ • tabular (B): schema-aware fixtures with referential integrity [roadmap]
6
+ • privacy (C): privacy-safe synthetic twins of real datasets [roadmap]
7
+
8
+ The generator layer differs per product; the grading engine, providers, and
9
+ report writers are shared.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ __version__ = "0.4.0"
synthkit/__main__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """`python3 -m synthkit …`"""
2
+ from synthkit.cli import main
3
+
4
+ if __name__ == "__main__":
5
+ main()
synthkit/cli.py ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Command-line interface for synthkit."""
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import sys
6
+ from typing import List
7
+
8
+ from synthkit import __version__
9
+ from synthkit.formats import FORMATS, to_format
10
+ from synthkit.grading import grade_dataset
11
+ from synthkit.io_utils import load_spec, read_records, write_jsonl, write_text
12
+ from synthkit.models import SynthkitError
13
+ from synthkit.providers import get_embedder, get_provider
14
+ from synthkit.report import print_report, to_html, to_json
15
+ from synthkit.text.generate import generate
16
+ from synthkit.text.seeds import BUILTIN_SEEDS, DEMO_EVAL
17
+
18
+ _GRADE_ORDER = ["F", "D", "C", "B", "A", "A+"]
19
+
20
+
21
def _progress(done: int, total: int) -> None:
    """Redraw the single-line generation counter on stderr.

    The line is rewritten in place via a leading carriage return; a newline is
    emitted only once ``done`` has reached ``total`` so the counter stays on
    one terminal row until generation finishes.
    """
    finished = done >= total
    print(f"\r generated {done}/{total}",
          end="\n" if finished else "",
          file=sys.stderr,
          flush=True)
24
+
25
+
26
def _maybe_embedder(args):
    """Return an embedder when ``args.semantic`` is set, otherwise ``None``.

    Attributes are read with ``getattr`` defaults, so a namespace that lacks
    ``semantic`` / ``embed_provider`` / ``embed_model`` is handled the same as
    one that carries the default values.
    """
    if getattr(args, "semantic", False):
        provider_name = getattr(args, "embed_provider", "ollama")
        model_name = getattr(args, "embed_model", "")
        return get_embedder(provider_name, model_name)
    return None
31
+
32
+
33
+ # ---- text gen ----------------------------------------------------------------
34
+
35
def _run_demo(args) -> int:
    """Generate the built-in demo dataset, grade it, and write all artifacts.

    Train and eval records come from disjoint slices of the demo task list (a
    held-out split). Five eval records are then copied verbatim into the train
    set so the contamination axis has real leakage to report. Reads
    ``args.ngram`` and ``args.no_color``; always returns 0.
    """
    dataset_path = "synthkit_demo.jsonl"

    def spec_for(task_subset):
        # Same demo spec, restricted to the given slice of tasks.
        return {**DEMO_EVAL, "slots": {**DEMO_EVAL["slots"], "task": task_subset}}

    print("synthkit demo: generating a coding eval set, then grading it…",
          file=sys.stderr)

    all_tasks = DEMO_EVAL["slots"]["task"]
    train = generate(spec_for(all_tasks[:7]), 195, seed=17)
    bench = generate(spec_for(all_tasks[7:]), 40, seed=99)

    # 5 genuine, verbatim leaks from the benchmark into the training data.
    leaked = [dict(rec) for rec in bench[:5]]
    data = train + leaked

    write_jsonl(dataset_path, data)
    write_jsonl("synthkit_demo.benchmark.jsonl", bench)

    report = grade_dataset(data, against=bench, ngram=args.ngram)
    print_report(report, dataset=dataset_path, use_color=not args.no_color)
    write_text("synthkit_demo.report.json", to_json(report, dataset_path))
    write_text("synthkit_demo.report.html", to_html(report, dataset_path))

    print(f" wrote synthkit_demo.jsonl ({len(data)} records) + benchmark ({len(bench)}) "
          "+ .report.json + .report.html", file=sys.stderr)
    print(" note: eval uses HELD-OUT tasks; 5 records were deliberately leaked into "
          "train, so contamination flags exactly those real leaks.", file=sys.stderr)
    return 0
60
+
61
+
62
def cmd_text_gen(args) -> int:
    """Handle ``synthkit text gen``: generate a dataset, write it, then grade it.

    ``--demo`` short-circuits to the built-in demo. Otherwise a seed spec is
    required (the process exits with an error message when it is missing).
    Grading is skipped when ``--no-grade`` is given. Returns 0.
    """
    if args.demo:
        return _run_demo(args)
    if not args.seed:
        sys.exit("error: provide --seed FILE (a JSON/YAML seed spec) or use --demo")

    spec = load_spec(args.seed)
    if args.kind:
        spec["kind"] = args.kind

    provider = get_provider(args.provider, args.model)
    if args.dedup_semantic:
        dedup_embedder = get_embedder(args.embed_provider, args.embed_model)
    else:
        dedup_embedder = None

    # A progress line is only shown when generation involves a provider or
    # the semantic-dedup embedder, and the user has not hidden it.
    busy = args.provider != "none" or args.dedup_semantic
    show = _progress if (busy and not args.no_progress) else None
    gstats: dict = {}

    print(f"synthkit: generating {args.num} records from {args.seed}…", file=sys.stderr)
    data = generate(
        spec,
        args.num,
        provider=provider,
        seed=args.seed_int,
        dedup=not args.no_dedup,
        concurrency=args.concurrency,
        progress=show,
        dedup_embedder=dedup_embedder,
        dedup_threshold=args.dedup_threshold,
        stats=gstats,
    )

    if args.dedup_semantic and gstats.get("rejected_semantic"):
        print(f" semantic dedup: rejected {gstats['rejected_semantic']} of "
              f"{gstats['attempts']} candidates (cosine ≥ {args.dedup_threshold})",
              file=sys.stderr)

    if len(data) < args.num:
        if args.dedup_semantic:
            reason = "raise --dedup-threshold or add slot variety"
        else:
            reason = ("the seed's template×slot space is exhausted "
                      "(add slot variety or pass --no-dedup)")
        print(f" note: produced {len(data)} of {args.num} requested, {reason}.",
              file=sys.stderr)

    out = args.out or "synth_text.jsonl"
    write_jsonl(out, to_format(data, args.format))
    format_note = f" ({args.format} format)" if args.format != "raw" else ""
    print(f" wrote {len(data)} records to {out}" + format_note)

    if args.no_grade:
        return 0

    against = read_records(args.against) if args.against else None
    report = grade_dataset(data, against=against, ngram=args.ngram,
                           embedder=_maybe_embedder(args))
    print_report(report, dataset=out, use_color=not args.no_color)
    if args.report_json:
        write_text(args.report_json, to_json(report, out))
    if args.html:
        write_text(args.html, to_html(report, out))
    return 0
108
+
109
+
110
+ # ---- grade -------------------------------------------------------------------
111
+
112
def cmd_grade(args) -> int:
    """Handle ``synthkit grade``: grade an existing dataset file.

    Exits with an error message when the file yields no records. Optionally
    writes JSON / HTML reports. Returns 1 when ``--min-grade`` is given and the
    achieved grade ranks below it in ``_GRADE_ORDER``; otherwise returns 0.
    """
    records = read_records(args.path)
    if not records:
        sys.exit(f"error: no records found in {args.path}")

    if args.against:
        against = read_records(args.against)
    else:
        against = None

    report = grade_dataset(
        records,
        fields=args.field or None,
        against=against,
        ngram=args.ngram,
        embedder=_maybe_embedder(args),
    )
    print_report(report, dataset=args.path, use_color=not args.no_color)

    if args.json:
        write_text(args.json, to_json(report, args.path))
        print(f" JSON written to {args.json}")
    if args.html:
        write_text(args.html, to_html(report, args.path))
        print(f" HTML written to {args.html}")

    # CI gate: compare positions in the worst-to-best grade ordering.
    below_gate = bool(args.min_grade) and (
        _GRADE_ORDER.index(report.grade) < _GRADE_ORDER.index(args.min_grade))
    if below_gate:
        print(f" grade {report.grade} is below --min-grade {args.min_grade}",
              file=sys.stderr)
        return 1
    return 0
133
+
134
+
135
+ # ---- list / roadmap ----------------------------------------------------------
136
+
137
def cmd_list(args) -> int:
    """Handle ``synthkit list``: print the product table and built-in seeds.

    For each entry in ``BUILTIN_SEEDS`` the listing shows the seed name, its
    kind, the number of templates, and the size of each slot. Returns 0.
    """
    header_lines = (
        "\nsynthkit products\n",
        " text (A) live LLM instruction & eval datasets + quality grading",
        " tabular (B) roadmap schema-aware fixtures with referential integrity",
        " privacy (C) roadmap privacy-safe synthetic twins of real datasets",
        "\n built-in text seeds (use with: text gen --demo, or copy from examples/)\n",
    )
    for line in header_lines:
        print(line)

    for name, spec in BUILTIN_SEEDS.items():
        template_count = len(spec["templates"])
        slot_sizes = [f"{len(values)} {slot}" for slot, values in spec["slots"].items()]
        slots = " × ".join(slot_sizes)
        print(f" {name:<12} {spec['kind']:<12} {template_count} templates · {slots}")

    print()
    return 0
149
+
150
+
151
def cmd_coming_soon(args) -> int:
    """Print the roadmap notice for a product that is not live yet.

    Reads ``args.product`` and ``args.blurb`` (set through the sub-parser's
    defaults) and returns 0.
    """
    notice = (
        f"\n synthkit {args.product}: {args.blurb}",
        " On the roadmap. Product A (`synthkit text`) is live today and B/C share",
        " the same core: providers, the grading engine, and the report writer.\n",
    )
    for line in notice:
        print(line)
    return 0
156
+
157
+
158
+ # ---- parser ------------------------------------------------------------------
159
+
160
def build_parser() -> argparse.ArgumentParser:
    """Build the top-level ``synthkit`` argument parser.

    Sub-commands are registered in this order: ``text`` (with its own ``gen``
    sub-command), ``grade``, ``list``, and the roadmap stubs ``tabular`` and
    ``privacy``. Each leaf parser stores its handler in ``func``.
    """
    parser = argparse.ArgumentParser(
        prog="synthkit",
        description="Generate synthetic data and grade it for quality "
                    "(validity, uniqueness, diversity, contamination).")
    parser.add_argument("--version", action="version", version=f"synthkit {__version__}")
    commands = parser.add_subparsers(dest="cmd")

    def add_text() -> None:
        # text (Product A) with its own subcommands
        text_parser = commands.add_parser(
            "text", help="Product A: LLM instruction & eval datasets")
        text_commands = text_parser.add_subparsers(dest="text_cmd")
        gen_parser = text_commands.add_parser(
            "gen", help="generate a text dataset and grade it")
        add = gen_parser.add_argument
        add("--demo", action="store_true",
            help="generate + grade a built-in coding eval set (no setup)")
        add("--seed", help="path to a JSON/YAML seed spec")
        add("-n", "--num", type=int, default=200, help="records to generate")
        add("--kind", choices=["eval", "instruction"],
            help="override the spec's kind")
        add("--provider", default="none",
            choices=["none", "ollama", "anthropic", "openai"],
            help="response generator for instruction data (default: none)")
        add("--model", default="", help="model name for the provider")
        add("-o", "--out", help="output JSONL path (default: synth_text.jsonl)")
        add("--seed-int", type=int, default=17, help="RNG seed (default: 17)")
        add("--no-dedup", action="store_true",
            help="keep exact-duplicate prompts instead of skipping them")
        add("--no-grade", action="store_true", help="skip grading the output")
        add("--against", help="held-out eval set to check contamination against")
        add("--ngram", type=int, default=8, help="contamination n-gram size")
        add("--format", default="raw", choices=list(FORMATS),
            help="output schema: raw|alpaca|sharegpt|openai (default: raw)")
        add("--concurrency", type=int, default=4,
            help="parallel provider calls when filling responses (default: 4)")
        add("--no-progress", action="store_true",
            help="hide the response progress line")
        add("--semantic", action="store_true",
            help="add an embedding-based semantic-dedup axis to grading")
        add("--embed-provider", default="ollama", choices=["ollama", "openai"],
            help="embedder for --semantic (default: ollama)")
        add("--embed-model", default="", help="embedding model name")
        add("--dedup-semantic", action="store_true",
            help="reject semantically-similar records during generation "
                 "(clean-by-construction; uses --embed-provider)")
        add("--dedup-threshold", type=float, default=0.9,
            help="cosine ≥ this ⇒ reject as a semantic duplicate (default: 0.9)")
        add("--report-json", help="write the grade report as JSON")
        add("--html", help="write the grade report as HTML")
        add("--no-color", action="store_true", help="disable ANSI colors")
        gen_parser.set_defaults(func=cmd_text_gen)

    def add_grade() -> None:
        # grade (shared across all products)
        grade_parser = commands.add_parser(
            "grade", help="grade any dataset (jsonl/json/csv) for quality")
        add = grade_parser.add_argument
        add("path", help="dataset to grade")
        add("--against", help="held-out eval set for the contamination check")
        add("--field", action="append", default=[],
            help="field(s) to analyze (repeatable; default: auto-detect)")
        add("--ngram", type=int, default=8, help="contamination n-gram size")
        add("--min-grade", choices=_GRADE_ORDER,
            help="exit non-zero if the grade is below this (CI gate)")
        add("--semantic", action="store_true",
            help="add an embedding-based semantic-dedup axis")
        add("--embed-provider", default="ollama", choices=["ollama", "openai"],
            help="embedder for --semantic (default: ollama)")
        add("--embed-model", default="", help="embedding model name")
        add("--json", help="write the report as JSON")
        add("--html", help="write the report as HTML")
        add("--no-color", action="store_true", help="disable ANSI colors")
        grade_parser.set_defaults(func=cmd_grade)

    def add_list() -> None:
        list_parser = commands.add_parser("list", help="list products and built-in seeds")
        list_parser.set_defaults(func=cmd_list)

    def add_roadmap_stubs() -> None:
        tabular_parser = commands.add_parser(
            "tabular", help="Product B: schema-aware fixtures [roadmap]")
        tabular_parser.set_defaults(
            func=cmd_coming_soon, product="tabular",
            blurb="schema-aware fixtures with referential integrity")
        privacy_parser = commands.add_parser(
            "privacy", help="Product C: privacy-safe twins [roadmap]")
        privacy_parser.set_defaults(
            func=cmd_coming_soon, product="privacy",
            blurb="privacy-safe synthetic twins of real datasets")

    # Registration order determines the order shown in --help.
    add_text()
    add_grade()
    add_list()
    add_roadmap_stubs()
    return parser
242
+
243
+
244
def main(argv: List[str] | None = None) -> None:
    """Entry point: parse ``argv``, dispatch to the sub-command, and exit.

    Args:
        argv: Argument list to parse. ``None`` (the default) lets argparse
            read ``sys.argv[1:]``.

    Always terminates through ``sys.exit``: with status 0 after printing help
    when no command is given, with the handler's return value otherwise, or
    with an ``error: ...`` message when the handler raises ``SynthkitError``.
    """
    parser = build_parser()
    args = parser.parse_args(argv)
    if not getattr(args, "cmd", None):
        parser.print_help()
        sys.exit(0)
    # bare `synthkit text` with no subcommand: re-parse with --help, which
    # makes argparse print the text sub-parser's help and exit.
    if args.cmd == "text" and not getattr(args, "func", None):
        parser.parse_args(["text", "--help"])
    try:
        sys.exit(args.func(args))
    except SynthkitError as exc:
        sys.exit(f"error: {exc}")
257
+
258
+
259
+ if __name__ == "__main__":
260
+ main()
synthkit/formats.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Convert generated records into common fine-tuning dataset formats.
2
+
3
+ raw what the generator emits (alpaca-ish: instruction/input/output, or prompt)
4
+ alpaca {instruction, input, output}
5
+ sharegpt {conversations: [{from: human, value}, {from: gpt, value}]}
6
+ openai {messages: [{role: system?}, {role: user}, {role: assistant}]}
7
+
8
+ Eval (prompt-only) records keep their prompt as the human/user turn with an
9
+ empty completion, so the same dataset can drive evaluation or be completed later.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ from typing import Any, Dict, List
14
+
15
+ from synthkit.models import SynthkitError
16
+
17
+ FORMATS = ("raw", "alpaca", "sharegpt", "openai")
18
+
19
+
20
def _parts(rec: Dict[str, Any]):
    """Split a record into ``(system, instruction, user_input, user, output)``.

    ``instruction`` falls back to the ``prompt`` key and ``output`` to the
    ``response`` key. ``user`` is the instruction alone, or the instruction
    and input separated by a blank line when an input is present.
    """
    system = rec.get("system", "")
    if "instruction" in rec:
        instruction = rec["instruction"]
    else:
        instruction = rec.get("prompt", "")
    user_input = rec.get("input", "")
    if "output" in rec:
        output = rec["output"]
    else:
        output = rec.get("response", "")

    if user_input:
        user = f"{instruction}\n\n{user_input}"
    else:
        user = instruction
    return system, instruction, user_input, user, output
27
+
28
+
29
def to_format(records: List[Dict[str, Any]], fmt: str) -> List[Dict[str, Any]]:
    """Convert generated records into the ``fmt`` fine-tuning schema.

    ``raw`` returns the input list itself, untouched. ``alpaca`` yields
    instruction/input/output dicts. ``sharegpt`` and ``openai`` yield chat
    transcripts with an optional leading system turn.

    Raises:
        SynthkitError: if ``fmt`` is not one of ``FORMATS``.
    """
    if fmt not in FORMATS:
        raise SynthkitError(f"unknown --format {fmt!r} (choose from {', '.join(FORMATS)})")
    if fmt == "raw":
        return records

    converted: List[Dict[str, Any]] = []
    for rec in records:
        system, instruction, user_input, user, output = _parts(rec)

        if fmt == "alpaca":
            converted.append(
                {"instruction": instruction, "input": user_input, "output": output})
            continue

        if fmt == "sharegpt":
            turns = [{"from": "system", "value": system}] if system else []
            turns.append({"from": "human", "value": user})
            turns.append({"from": "gpt", "value": output})
            converted.append({"conversations": turns})
            continue

        if fmt == "openai":
            messages = [{"role": "system", "content": system}] if system else []
            messages.append({"role": "user", "content": user})
            messages.append({"role": "assistant", "content": output})
            converted.append({"messages": messages})
    return converted
synthkit/grading.py ADDED
@@ -0,0 +1,422 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """The grading engine, the part every synthkit product shares.
2
+
3
+ Given a list of records, score the dataset on four axes:
4
+
5
+ validity structurally sound records (required fields, non-empty, sane length)
6
+ uniqueness free of exact and near-duplicate records (MinHash + LSH)
7
+ diversity lexical variety across the set (distinct-n, self-similarity)
8
+ contamination overlap with a held-out eval/benchmark set (n-gram containment)
9
+
10
+ Everything here is standard-library only and deterministic for a fixed seed.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import hashlib
15
+ import math
16
+ import random
17
+ import re
18
+ from typing import Any, Dict, List, Optional, Sequence, Set, Tuple
19
+
20
+ from synthkit.models import DimensionScore, GradeReport, to_grade
21
+
22
+ _MERSENNE = (1 << 61) - 1
23
+ _WORD = re.compile(r"[a-z0-9]+")
24
+ # Content fields we analyze by default (role/config keys like "system" excluded).
25
+ _KNOWN_FIELDS = ("instruction", "input", "prompt", "question",
26
+ "output", "response", "answer", "text")
27
+
28
+
29
+ # ---- text helpers ------------------------------------------------------------
30
+
31
def record_text(rec: Dict[str, Any], fields: Optional[Sequence[str]]) -> str:
    """Join the analyzed fields of ``rec`` into one space-separated string.

    With explicit ``fields`` exactly those keys are read (missing keys count
    as ""). Otherwise the known content fields present in the record are used,
    falling back to every string-valued key. ``None`` values are dropped.
    """
    if fields:
        selected = list(fields)
    else:
        selected = [name for name in _KNOWN_FIELDS if name in rec]
        if not selected:
            selected = [key for key, value in rec.items() if isinstance(value, str)]

    pieces = []
    for key in selected:
        value = rec.get(key, "")
        if value is not None:
            pieces.append(str(value))
    return " ".join(pieces)
41
+
42
+
43
def tokens(text: str) -> List[str]:
    """Lower-case ``text`` and return every match of the ``_WORD`` pattern."""
    lowered = text.lower()
    return _WORD.findall(lowered)
45
+
46
+
47
def _stable_hash(s: str) -> int:
    """Return a 64-bit integer hash of ``s`` that is stable across processes.

    Uses an 8-byte BLAKE2b digest of the UTF-8 encoding, read as a big-endian
    unsigned integer.
    """
    digest_hex = hashlib.blake2b(s.encode("utf-8"), digest_size=8).hexdigest()
    return int(digest_hex, 16)
49
+
50
+
51
def shingles(toks: Sequence[str], k: int) -> Set[str]:
    """Return the set of space-joined ``k``-token windows over ``toks``.

    An empty token list gives an empty set; a list shorter than ``k`` gives a
    single shingle holding all of its tokens.
    """
    if not toks:
        return set()
    count = len(toks)
    if count < k:
        return {" ".join(toks)}
    window_starts = range(count - k + 1)
    return {" ".join(toks[start:start + k]) for start in window_starts}
57
+
58
+
59
def ngram_set(toks: Sequence[str], n: int) -> Set[Tuple[str, ...]]:
    """Return the word ``n``-grams of ``toks`` as a set of tuples.

    An empty token list gives an empty set; a list with at most ``n`` tokens
    contributes one tuple holding all of its tokens.
    """
    if not toks:
        return set()
    count = len(toks)
    if count <= n:
        return {tuple(toks)}
    window_starts = range(count - n + 1)
    return {tuple(toks[start:start + n]) for start in window_starts}
66
+
67
+
68
+ # ---- MinHash + LSH near-duplicate detection ----------------------------------
69
+
70
class _MinHasher:
    """Seeded family of ``num_perm`` linear hash functions for MinHash signatures."""

    def __init__(self, num_perm: int, seed: int) -> None:
        # All multipliers are drawn first, then all offsets, from one seeded
        # RNG; this draw order fixes the signatures for a given seed.
        rng = random.Random(seed)
        self.a = [rng.randrange(1, _MERSENNE) for _ in range(num_perm)]
        self.b = [rng.randrange(0, _MERSENNE) for _ in range(num_perm)]

    def sign(self, shs: Set[str]) -> Optional[Tuple[int, ...]]:
        """Return the MinHash signature of ``shs``, or ``None`` for an empty set."""
        if not shs:
            return None
        hashed = [_stable_hash(shingle) for shingle in shs]
        signature = []
        for mult, offset in zip(self.a, self.b):
            signature.append(min((mult * h + offset) % _MERSENNE for h in hashed))
        return tuple(signature)
82
+
83
+
84
class _UnionFind:
    """Disjoint-set forest over the integers ``0..n-1``.

    Merging always keeps the smaller root index, so a set's root is its
    lowest-numbered member (the "original" of a duplicate cluster).
    """

    def __init__(self, n: int) -> None:
        self.parent = list(range(n))

    def find(self, x: int) -> int:
        """Return the root of ``x``, shortening the path walked on the way."""
        parent = self.parent
        while parent[x] != x:
            # Point x at its grandparent, then step up to it.
            parent[x] = parent[parent[x]]
            x = parent[x]
        return x

    def union(self, x: int, y: int) -> None:
        """Merge the sets containing ``x`` and ``y``."""
        root_x, root_y = self.find(x), self.find(y)
        if root_x != root_y:
            if root_x < root_y:
                keep, drop = root_x, root_y
            else:
                keep, drop = root_y, root_x
            self.parent[drop] = keep
103
+
104
+
105
class _LSHIndex:
    """MinHash + LSH index over a list of shingle sets.

    Each signature is cut into ``bands`` slices of ``num_perm // bands`` values
    and every slice is used as a bucket key. ``candidates(shingles)`` returns
    the positions that share at least one bucket with the query, so callers
    only compare against likely matches instead of every entry. Entries with
    an empty shingle set are not indexed.
    """

    def __init__(self, shingle_sets: List[Set[str]], *, num_perm: int = 64,
                 bands: int = 16, seed: int = 17) -> None:
        self._hasher = _MinHasher(num_perm, seed)
        self._bands = bands
        self._rows = num_perm // bands
        self._buckets: Dict[Tuple[int, Tuple[int, ...]], List[int]] = {}
        for position, members in enumerate(shingle_sets):
            for key in self._band_keys(members):
                self._buckets.setdefault(key, []).append(position)

    def _band_keys(self, shingset: Set[str]) -> List[Tuple[int, Tuple[int, ...]]]:
        """Return one ``(band, signature slice)`` key per band; [] if unsignable."""
        signature = self._hasher.sign(shingset)
        if signature is None:
            return []
        width = self._rows
        return [(band, signature[band * width:(band + 1) * width])
                for band in range(self._bands)]

    def candidates(self, shingset: Set[str]) -> Set[int]:
        """Return indexed positions sharing at least one band with ``shingset``."""
        found: Set[int] = set()
        for key in self._band_keys(shingset):
            found.update(self._buckets.get(key, ()))
        return found
137
+
138
+
139
def _duplicate_map(shingle_sets: List[Set[str]], *, threshold: float = 0.8,
                   seed: int = 17) -> Dict[int, int]:
    """Return {pos: root_pos} for every entry that near-duplicates an earlier one.

    LSH supplies candidate pairs; a pair is merged when the Jaccard similarity
    of the two shingle sets reaches ``threshold``. Roots are the lowest
    position in each merged cluster and are not included as keys.
    """
    total = len(shingle_sets)
    lsh = _LSHIndex(shingle_sets, seed=seed)
    forest = _UnionFind(total)

    for current, members in enumerate(shingle_sets):
        if not members:
            continue
        for j in lsh.candidates(members):
            # Only look backwards so each pair is examined once.
            if j >= current:
                continue
            rival = shingle_sets[j]
            if not rival:
                continue
            jaccard = len(members & rival) / len(members | rival)
            if jaccard >= threshold:
                forest.union(current, j)

    mapping: Dict[int, int] = {}
    for position in range(total):
        root = forest.find(position)
        if root != position:
            mapping[position] = root
    return mapping
160
+
161
+
162
+ # ---- the four dimensions -----------------------------------------------------
163
+
164
def _validity_dim(records, texts, *, min_words, max_words) -> DimensionScore:
    """Score the share of records that are non-empty and within the word limits.

    A text counts as bad when it is blank, has fewer than ``min_words`` tokens,
    or more than ``max_words`` tokens. Up to three example descriptions are
    appended to the findings. The score is the percentage of good records
    (0.0 when there are no records).
    """
    total = len(records)
    counts = {"empty": 0, "too_short": 0, "too_long": 0}
    samples: List[str] = []

    for text in texts:
        word_count = len(tokens(text))
        if not text.strip():
            category, sample = "empty", "empty record"
        elif word_count < min_words:
            category, sample = "too_short", f"only {word_count} words: {text[:60]!r}"
        elif word_count > max_words:
            category, sample = "too_long", f"{word_count} words (over {max_words})"
        else:
            continue
        counts[category] += 1
        if len(samples) < 3:
            samples.append(sample)

    bad = counts["empty"] + counts["too_short"] + counts["too_long"]
    score = 100.0 * (1 - bad / total) if total else 0.0

    findings: List[str] = []
    if counts["empty"]:
        findings.append(f"{counts['empty']} empty record(s)")
    if counts["too_short"]:
        findings.append(f"{counts['too_short']} below {min_words} words")
    if counts["too_long"]:
        findings.append(f"{counts['too_long']} above {max_words} words")
    findings.extend(samples)

    return DimensionScore(
        "validity", "Validity", round(score, 1),
        f"{total - bad}/{total} records well-formed", findings,
        {**counts, "n": total}, weight=0.25)
196
+
197
+
198
def _uniqueness_dim(records, texts, shingle_sets, *, seed) -> DimensionScore:
    """Score the share of records that are neither exact nor near duplicates.

    Exact duplicates are found by comparing token-normalized text. The
    near-duplicate search then runs only over the remaining representatives,
    and its local positions are mapped back to dataset positions.
    """
    total = len(records)

    seen_norms: Set[str] = set()
    exact_dup: Set[int] = set()
    representatives: List[int] = []
    for position, text in enumerate(texts):
        normalized = " ".join(tokens(text))
        if normalized and normalized in seen_norms:
            exact_dup.add(position)
            continue
        if normalized:
            seen_norms.add(normalized)
        representatives.append(position)

    # near-dup search runs only over exact-unique representatives (keeps it cheap)
    local_dup = _duplicate_map([shingle_sets[p] for p in representatives], seed=seed)

    near_dup: Set[int] = set()
    cluster_roots: Set[int] = set()
    example = ""
    for local_pos, local_root in local_dup.items():
        dup_pos = representatives[local_pos]
        root_pos = representatives[local_root]
        near_dup.add(dup_pos)
        cluster_roots.add(root_pos)
        if not example:
            example = f"e.g. #{dup_pos} ≈ #{root_pos}: {texts[dup_pos][:64]!r}"

    exact_count, near_count = len(exact_dup), len(near_dup)
    dup_total = exact_count + near_count
    score = 100.0 * (1 - dup_total / total) if total else 0.0

    findings: List[str] = []
    if exact_dup:
        findings.append(f"{exact_count} exact duplicate(s)")
    if near_dup:
        findings.append(f"{near_count} near-duplicate(s) in {len(cluster_roots)} cluster(s)")
    if example:
        findings.append(example)

    return DimensionScore(
        "uniqueness", "Uniqueness", round(score, 1),
        f"{total - dup_total}/{total} unique ({exact_count} exact, {near_count} near)",
        findings,
        {"exact": exact_count, "near": near_count,
         "clusters": len(cluster_roots), "n": total}, weight=0.30)
238
+
239
+
240
def _diversity_dim(texts, shingle_sets, *, seed) -> DimensionScore:
    """Score lexical variety from distinct-n ratios and sampled self-similarity.

    distinct-1 / distinct-2 are the unique-to-total ratios of unigrams and
    bigrams over the whole set. Self-similarity is the mean Jaccard similarity
    over a seeded sample of record pairs with non-empty shingle sets.
    """
    all_unigrams: List[str] = []
    all_bigrams: List[Tuple[str, str]] = []
    for text in texts:
        words = tokens(text)
        all_unigrams.extend(words)
        all_bigrams.extend(zip(words, words[1:]))

    vocab = set(all_unigrams)
    d1 = len(vocab) / len(all_unigrams) if all_unigrams else 0.0
    d2 = len(set(all_bigrams)) / len(all_bigrams) if all_bigrams else 0.0

    # self-similarity: mean Jaccard over a seeded sample of record pairs
    rng = random.Random(seed)
    usable = [pos for pos, members in enumerate(shingle_sets) if members]
    sims: List[float] = []
    if len(usable) >= 2:
        draws = min(2000, len(usable) * 4)
        for _ in range(draws):
            first, second = rng.sample(usable, 2)
            left, right = shingle_sets[first], shingle_sets[second]
            sims.append(len(left & right) / len(left | right))
    self_sim = sum(sims) / len(sims) if sims else 0.0

    # Pairwise distinctness (1 - self-similarity) is the size-stable signal and
    # carries the larger weight; distinct-2 is a secondary term with a lenient
    # target of 0.25 because corpus-level distinct-n shrinks as the set grows.
    # The raw numbers are reported in stats alongside the score.
    distinctness = 1 - self_sim
    bigram_term = min(1, d2 / 0.25)
    score = 100.0 * (0.6 * distinctness + 0.4 * bigram_term)

    findings: List[str] = []
    if d2 < 0.4:
        findings.append("low bigram diversity, templates may be too repetitive")
    if self_sim > 0.3:
        findings.append(f"records are {self_sim * 100:.0f}% similar on average")

    return DimensionScore(
        "diversity", "Diversity", round(score, 1),
        f"distinct-2 {d2:.2f} · distinct-1 {d1:.2f} · self-sim {self_sim:.2f}",
        findings,
        {"distinct_1": round(d1, 4), "distinct_2": round(d2, 4),
         "self_similarity": round(self_sim, 4), "vocab": len(vocab)},
        weight=0.25)
276
+
277
+
278
def _contamination_dim(texts, shingle_sets, against_texts, *, ngram) -> DimensionScore:
    """Score how many records overlap a held-out eval set.

    A record is flagged when it is a near-duplicate of an eval item (Jaccard
    of 5-token shingles >= 0.7 among LSH candidates) or when at least 80% of
    its word ``ngram``-grams occur in the eval set. With ``against_texts`` set
    to ``None`` the dimension is returned with a ``None`` score.
    """
    if against_texts is None:
        return DimensionScore(
            "contamination", "Contamination", None,
            "no eval set provided (pass --against to check)", [], {}, weight=0.20)

    eval_ngrams: Set[Tuple[str, ...]] = set()
    eval_shingles: List[Set[str]] = []
    for eval_text in against_texts:
        eval_tokens = tokens(eval_text)
        eval_ngrams |= ngram_set(eval_tokens, ngram)
        eval_shingles.append(shingles(eval_tokens, 5))
    # LSH lookup avoids comparing every record against every eval item.
    eval_index = _LSHIndex(eval_shingles, seed=17)

    flagged: List[Tuple[int, str, str]] = []
    for position, text in enumerate(texts):
        members = shingle_sets[position]
        reason = ""

        # First test: near-duplicate of an eval item.
        if members:
            for j in eval_index.candidates(members):
                rival = eval_shingles[j]
                if rival and len(members & rival) / len(members | rival) >= 0.7:
                    reason = "near-duplicate of an eval item"
                    break

        # Second test: n-gram containment, i.e. the fraction of THIS record's
        # n-grams found in the eval set. Shared template boilerplate only
        # covers a few n-grams, so it does not trip this the way a raw
        # "shares any n-gram" check would.
        if not reason:
            grams = ngram_set(tokens(text), ngram)
            if grams:
                contained = len(grams & eval_ngrams) / len(grams)
                if contained >= 0.8:
                    reason = f"{contained * 100:.0f}% of its {ngram}-grams are in the eval set"

        if reason:
            flagged.append((position, reason, text[:64]))

    total = len(texts)
    hits = len(flagged)
    score = 100.0 * (1 - hits / total) if total else 100.0
    if flagged:
        summary = f"{hits}/{total} records overlap the eval set"
    else:
        summary = f"clean: 0/{total} overlap the eval set"
    findings = [f"#{i}: {why}: {snip!r}" for i, why, snip in flagged[:4]]
    return DimensionScore(
        "contamination", "Contamination", round(score, 1), summary, findings,
        {"flagged": hits, "n": total, "ngram": ngram}, weight=0.20)
321
+
322
+
323
+ # ---- optional semantic axis (embedding-based) --------------------------------
324
+
325
def _unit(v: Sequence[float]) -> List[float]:
    """Scale ``v`` to unit Euclidean length; a zero vector is returned unscaled."""
    length = math.sqrt(sum(x * x for x in v))
    if not length:
        # Avoid dividing by zero for an all-zero (or empty) vector.
        length = 1.0
    return [x / length for x in v]
328
+
329
+
330
def _cos_unit(a: Sequence[float], b: Sequence[float]) -> float:
    """Return the dot product of ``a`` and ``b``.

    For vectors already scaled to unit length this equals their cosine
    similarity. Extra trailing elements of the longer vector are ignored.
    """
    products = (left * right for left, right in zip(a, b))
    return sum(products)
332
+
333
+
334
def _semantic_dups(units: List[List[float]], threshold: float) -> Set[int]:
    """Return positions whose vector is within ``threshold`` of an earlier kept one.

    Positions are scanned in order. Every not-yet-flagged later position whose
    dot product with the current one reaches ``threshold`` is flagged, and
    flagged positions are never used as the comparison anchor. NumPy computes
    all dot products at once when it can be imported; otherwise they are
    computed pair by pair in pure Python.
    """
    try:
        import numpy as np  # fast path if available
    except ImportError:
        np = None

    if np is not None:
        matrix = np.asarray(units)
        gram = matrix @ matrix.T

        def similarity(first: int, second: int):
            return gram[first][second]
    else:
        def similarity(first: int, second: int):
            return sum(x * y for x, y in zip(units[first], units[second]))

    count = len(units)
    flagged: Set[int] = set()
    for anchor in range(count):
        if anchor in flagged:
            continue
        for later in range(anchor + 1, count):
            if later not in flagged and similarity(anchor, later) >= threshold:
                flagged.add(later)
    return flagged
356
+
357
+
358
def _semantic_dim(texts, embedder, *, threshold, seed, max_n=400) -> DimensionScore:
    """Score embedding-based near-duplication among the non-blank texts.

    At most ``max_n`` texts are embedded (a seeded sample, kept in dataset
    order, when there are more). The score is the percentage of embedded texts
    not flagged as semantic duplicates. The mean cosine over a seeded sample
    of pairs is reported in the summary and stats.
    """
    chosen = [pos for pos, text in enumerate(texts) if text.strip()]
    sampled_note = ""
    if len(chosen) > max_n:
        chosen = sorted(random.Random(seed).sample(chosen, max_n))
        sampled_note = f" (sampled {max_n})"

    vectors = embedder.embed([texts[pos] for pos in chosen])
    units = [_unit(vector) for vector in vectors]
    duplicates = _semantic_dups(units, threshold)

    rng = random.Random(seed)
    cosines = []
    if len(units) >= 2:
        draws = min(2000, len(units) * 4)
        for _ in range(draws):
            first, second = rng.sample(range(len(units)), 2)
            cosines.append(_cos_unit(units[first], units[second]))
    mean_sim = sum(cosines) / len(cosines) if cosines else 0.0

    total = len(chosen)
    dup_count = len(duplicates)
    score = 100.0 * (1 - dup_count / total) if total else 100.0

    findings = []
    if duplicates:
        findings.append(f"{dup_count} semantic near-duplicate(s) at cosine ≥ {threshold} "
                        "(paraphrases lexical dedup misses)")
    summary = (f"{total - dup_count}/{total} semantically distinct · "
               f"mean cosine {mean_sim:.2f}{sampled_note}")
    return DimensionScore(
        "semantic", "Semantic dedup", round(score, 1), summary, findings,
        {"semantic_dups": dup_count, "mean_cosine": round(mean_sim, 4),
         "n": total, "threshold": threshold}, weight=0.20)
384
+
385
+
386
+ # ---- public API --------------------------------------------------------------
387
+
388
def grade_dataset(records: List[Dict[str, Any]], *,
                  fields: Optional[Sequence[str]] = None,
                  against: Optional[List[Dict[str, Any]]] = None,
                  against_fields: Optional[Sequence[str]] = None,
                  ngram: int = 8, min_words: int = 3, max_words: int = 512,
                  seed: int = 17, embedder=None,
                  semantic_threshold: float = 0.83) -> GradeReport:
    """Grade ``records`` and return a ``GradeReport``.

    Args:
        records: Dataset to grade; an empty list yields an "F" report with no
            dimensions.
        fields: Keys to analyze; ``None`` lets ``record_text`` pick them.
        against: Optional held-out eval records for the contamination check.
        against_fields: Keys to read from ``against``; defaults to ``fields``.
        ngram: Word n-gram size for the contamination check.
        min_words, max_words: Word-count bounds for the validity check.
        seed: Seed passed to the sampled / hashed computations.
        embedder: Optional embedder; when given, a semantic dimension is added.
        semantic_threshold: Cosine threshold for the semantic dimension.

    The overall score is the weight-normalized mean of the dimensions whose
    ``applicable`` attribute is truthy.
    """
    if not records:
        return GradeReport(grade="F", score=0.0, n_records=0, dimensions=[],
                           meta={"note": "empty dataset"})

    texts = [record_text(rec, fields) for rec in records]
    shingle_sets = [shingles(tokens(text), 5) for text in texts]

    if against is None:
        against_texts = None
    else:
        eval_fields = against_fields or fields
        against_texts = [record_text(rec, eval_fields) for rec in against]

    dims = []
    dims.append(_validity_dim(records, texts, min_words=min_words, max_words=max_words))
    dims.append(_uniqueness_dim(records, texts, shingle_sets, seed=seed))
    dims.append(_diversity_dim(texts, shingle_sets, seed=seed))
    dims.append(_contamination_dim(texts, shingle_sets, against_texts, ngram=ngram))
    if embedder is not None:
        dims.append(_semantic_dim(texts, embedder, threshold=semantic_threshold, seed=seed))

    # Dimensions that do not apply are left out of the weighted mean.
    scored = [dim for dim in dims if dim.applicable]
    total_weight = sum(dim.weight for dim in scored) or 1.0
    overall = sum(dim.score * dim.weight for dim in scored) / total_weight

    meta = {
        "fields": list(fields) if fields else "auto",
        "has_eval": against_texts is not None,
        "semantic": embedder is not None,
    }
    return GradeReport(
        grade=to_grade(overall), score=round(overall, 1), n_records=len(records),
        dimensions=dims, meta=meta)
synthkit/io_utils.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Read and write datasets as JSONL, JSON, or CSV, standard library only."""
2
+ from __future__ import annotations
3
+
4
+ import csv
5
+ import json
6
+ import os
7
+ from typing import Any, Dict, List
8
+
9
+ from synthkit.models import SynthkitError
10
+
11
+
12
def _ext(path: str) -> str:
    """Return the lower-cased file extension of ``path`` (including the dot)."""
    return os.path.splitext(path)[1].lower()


def read_records(path: str) -> List[Dict[str, Any]]:
    """Load records from .jsonl / .ndjson / .json / .csv / .tsv.

    * JSONL: one JSON value per non-blank line.
    * JSON: a top-level list, a wrapper object holding the list under
      ``data`` / ``records`` / ``rows`` / ``examples``, or a single object
      (returned as a one-record list).
    * CSV / TSV: parsed with ``csv.DictReader``.

    Raises:
        SynthkitError: for an unsupported extension, or a JSON document whose
            top level is neither a list nor an object.
    """
    ext = _ext(path)
    # The csv module needs newline="" so line endings embedded in quoted
    # fields reach the parser untouched; JSON input keeps the default mode.
    newline = "" if ext in (".csv", ".tsv") else None
    with open(path, "r", encoding="utf-8", newline=newline) as fh:
        if ext in (".jsonl", ".ndjson"):
            return [json.loads(line) for line in fh if line.strip()]
        if ext == ".json":
            data = json.load(fh)
            if isinstance(data, dict):
                # common wrappers: {"data": [...]} / {"records": [...]}
                for key in ("data", "records", "rows", "examples"):
                    if isinstance(data.get(key), list):
                        return data[key]
                return [data]
            if not isinstance(data, list):
                # A bare string/number is not a dataset; list("abc") would
                # silently explode it into characters.
                raise SynthkitError(
                    f"{path}: expected a JSON array or object of records, "
                    f"got {type(data).__name__}")
            return list(data)
        if ext in (".csv", ".tsv"):
            delim = "\t" if ext == ".tsv" else ","
            return list(csv.DictReader(fh, delimiter=delim))
    raise SynthkitError(f"unsupported input format {ext or path!r}")
35
+
36
+
37
def write_jsonl(path: str, records: List[Dict[str, Any]]) -> None:
    """Write ``records`` to ``path`` as UTF-8 JSON Lines, one record per line.

    The parent directory is created if needed; non-ASCII text is written
    as-is rather than escaped.
    """
    _ensure_dir(path)
    with open(path, "w", encoding="utf-8") as out:
        out.writelines(json.dumps(record, ensure_ascii=False) + "\n"
                       for record in records)
42
+
43
+
44
def write_text(path: str, text: str) -> None:
    """Write ``text`` to ``path`` as UTF-8, creating the parent directory if needed."""
    _ensure_dir(path)
    with open(path, "w", encoding="utf-8") as out:
        out.write(text)
48
+
49
+
50
def load_spec(path: str) -> Dict[str, Any]:
    """Load a seed spec from JSON (always) or YAML (if pyyaml is installed).

    Files ending in ``.yaml`` / ``.yml`` are parsed with ``yaml.safe_load``;
    every other extension is parsed as JSON.

    Raises:
        SynthkitError: if a YAML spec is requested but pyyaml is missing.
    """
    suffix = _ext(path)
    with open(path, "r", encoding="utf-8") as handle:
        if suffix not in (".yaml", ".yml"):
            return json.load(handle)
        try:
            import yaml  # optional dependency
        except ImportError as exc:  # pragma: no cover
            raise SynthkitError(
                "reading YAML seeds needs pyyaml. "
                "`pip install pyyaml`, or use a .json seed."
            ) from exc
        return yaml.safe_load(handle)
64
+
65
+
66
+ def _ensure_dir(path: str) -> None:
67
+ d = os.path.dirname(os.path.abspath(path))
68
+ os.makedirs(d, exist_ok=True)
synthkit/models.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Core data types shared across every synthkit product."""
2
+ from __future__ import annotations
3
+
4
+ from dataclasses import dataclass, field
5
+ from typing import Any, Dict, List, Optional
6
+
7
+
8
class SynthkitError(Exception):
    """Error meant to be shown to the user.

    Covers problems such as a bad seed spec, unreadable input, or a provider
    failure. Library code raises this rather than calling
    sys.exit/SystemExit, so embedders of the library (the Gradio app, the
    tests) can catch it; the CLI turns it into a clean non-zero exit.
    """
15
+
16
+
17
# Letter grades, best to worst, same scale as the rest of the portfolio.
# Each entry is (minimum score, letter); order matters for to_grade().
GRADE_BANDS = [
    (97, "A+"),
    (93, "A"),
    (85, "B"),
    (75, "C"),
    (65, "D"),
    (0, "F"),
]


def to_grade(score: float) -> str:
    """Map a 0-100 score to its letter grade.

    Returns the letter of the first band whose cutoff the score reaches;
    scores that reach no band (e.g. negative values) grade as "F".
    """
    matching = (letter for cutoff, letter in GRADE_BANDS if score >= cutoff)
    return next(matching, "F")
28
+
29
+
30
@dataclass
class DimensionScore:
    """Result for one quality axis of a graded dataset.

    A dimension whose ``score`` is ``None`` could not be evaluated and is
    reported as not ``applicable``.
    """

    key: str                                             # machine identifier
    title: str                                           # label shown in reports
    score: Optional[float]                               # 0–100, or None when N/A
    summary: str = ""                                    # one-line description
    findings: List[str] = field(default_factory=list)    # human-readable notes
    stats: Dict[str, Any] = field(default_factory=dict)  # raw numbers
    weight: float = 1.0                                  # share in the overall blend

    @property
    def applicable(self) -> bool:
        """True when this dimension produced a score."""
        if self.score is None:
            return False
        return True
45
+
46
+
47
@dataclass
class GradeReport:
    """Overall grading outcome for one dataset."""

    grade: str                                                      # letter grade
    score: float                                                    # 0–100 overall
    n_records: int                                                  # records graded
    dimensions: List[DimensionScore] = field(default_factory=list)  # per-axis results
    meta: Dict[str, Any] = field(default_factory=dict)              # run metadata

    def dim(self, key: str) -> Optional[DimensionScore]:
        """Return the first dimension whose ``key`` matches, or ``None``."""
        return next((d for d in self.dimensions if d.key == key), None)
synthkit/privacy/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ """Product C: privacy-safe synthetic twins of real datasets. [roadmap]
2
+
3
+ Fits a generator to a real dataset, emits a statistically similar synthetic
4
+ copy, and adds privacy/utility dimensions (membership-inference distance,
5
+ column-distribution fidelity) to the shared grading report.
6
+ """
synthkit/providers.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Response + embedding providers.
2
+
3
+ A *provider* turns a prompt into a response (for instruction→output pairs); an
4
+ *embedder* turns text into a vector (for the optional semantic-quality axis).
5
+ The 'none' path is stdlib; ollama is local & free; anthropic/openai are lazy
6
+ imports used only if selected.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import os
12
+ import urllib.error
13
+ import urllib.request
14
+ from typing import List, Optional
15
+
16
+ from synthkit.models import SynthkitError
17
+
18
+ # ---- HTTP helper with friendly Ollama errors ---------------------------------
19
+
20
def _post_json(url: str, payload: dict, timeout: int = 120) -> dict:
    """POST ``payload`` as JSON to ``url`` and return the decoded JSON reply.

    Every transport or decoding failure is converted into a
    :class:`SynthkitError` (chained to the original exception) so callers
    only need to handle one error type.

    Args:
        url: Full endpoint URL.
        payload: JSON-serialisable request body.
        timeout: Timeout in seconds passed to ``urlopen``.

    Raises:
        SynthkitError: on an HTTP error status, an unreachable host, a
            timeout or dropped connection, or a reply that is not valid JSON.
    """
    body = json.dumps(payload).encode("utf-8")
    req = urllib.request.Request(url, data=body,
                                 headers={"Content-Type": "application/json"})
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            raw = resp.read()
    except urllib.error.HTTPError as exc:
        # HTTPError subclasses URLError, so it must be handled first.
        detail = exc.read().decode("utf-8", "ignore")[:200]
        hint = ""
        if exc.code == 404 and "model" in detail.lower():
            hint = ". Pull it first with `ollama pull <model>`"
        raise SynthkitError(
            f"Ollama returned HTTP {exc.code} from {url}{hint}\n {detail}") from exc
    except urllib.error.URLError as exc:
        raise SynthkitError(
            f"can't reach Ollama at {url} ({exc.reason}). "
            "Is the daemon running? Start it with `ollama serve`.") from exc
    except OSError as exc:
        # Read timeouts and connection resets while reading the body are
        # raised as plain OSError subclasses, not URLError.
        raise SynthkitError(
            f"request to Ollama at {url} failed ({exc}).") from exc

    try:
        return json.loads(raw.decode("utf-8"))
    except ValueError as exc:
        # Covers both UnicodeDecodeError and json.JSONDecodeError.
        raise SynthkitError(
            f"Ollama at {url} returned a response that is not valid JSON") from exc
37
+
38
+
39
+ # ---- response providers ------------------------------------------------------
40
+
41
class Provider:
    """Base class for response providers; subclasses implement ``generate``."""

    name = "base"

    def generate(self, prompt: str, system: str = "") -> str:
        """Return the response to ``prompt`` given an optional ``system`` prompt."""
        raise NotImplementedError
46
+
47
+
48
class OllamaProvider(Provider):
    """Local, free responses via a running Ollama daemon."""

    name = "ollama"

    def __init__(self, model: str = "llama3.2", host: str = "") -> None:
        """Remember the model and base URL.

        The host is taken from ``host``, else the ``OLLAMA_HOST``
        environment variable, else ``http://localhost:11434``; trailing
        slashes are removed.
        """
        self.model = model
        base_url = host or os.environ.get("OLLAMA_HOST", "http://localhost:11434")
        self.host = base_url.rstrip("/")

    def generate(self, prompt: str, system: str = "") -> str:
        """Call ``/api/generate`` (non-streaming) and return the stripped response text."""
        request = {
            "model": self.model, "prompt": prompt,
            "system": system, "stream": False,
        }
        reply = _post_json(self.host + "/api/generate", request)
        text = reply.get("response") or ""
        return text.strip()
63
+
64
+
65
class AnthropicProvider(Provider):
    """Responses via the Anthropic SDK, which is imported lazily on construction."""

    name = "anthropic"

    def __init__(self, model: str = "claude-haiku-4-5-20251001") -> None:
        """Create the SDK client; raises SynthkitError if the SDK is not installed."""
        try:
            import anthropic
        except ImportError as exc:
            raise SynthkitError("--provider anthropic needs the anthropic SDK. "
                                "`pip install anthropic`.") from exc
        self.model = model
        self._client = anthropic.Anthropic()

    def generate(self, prompt: str, system: str = "") -> str:
        """Send one user message and return the concatenated text blocks, stripped."""
        reply = self._client.messages.create(
            model=self.model, max_tokens=1024,
            system=system or "You are a helpful assistant.",
            messages=[{"role": "user", "content": prompt}])
        # Keep only blocks that declare themselves as text.
        pieces = [block.text for block in reply.content
                  if getattr(block, "type", "") == "text"]
        return "".join(pieces).strip()
84
+
85
+
86
class OpenAIProvider(Provider):
    """Responses via the OpenAI SDK, which is imported lazily on construction."""

    name = "openai"

    def __init__(self, model: str = "gpt-4o-mini") -> None:
        """Create the SDK client; raises SynthkitError if the SDK is not installed."""
        try:
            import openai
        except ImportError as exc:
            raise SynthkitError("--provider openai needs the openai SDK. "
                                "`pip install openai`.") from exc
        self.model = model
        self._client = openai.OpenAI()

    def generate(self, prompt: str, system: str = "") -> str:
        """Run one chat completion and return the first choice's text, stripped."""
        conversation = [
            {"role": "system", "content": system or "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ]
        completion = self._client.chat.completions.create(
            model=self.model, messages=conversation)
        answer = completion.choices[0].message.content or ""
        return answer.strip()
104
+
105
+
106
def get_provider(name: Optional[str], model: str = "") -> Optional[Provider]:
    """Build the response provider selected by ``name``.

    Returns ``None`` for an empty name or ``"none"``. When ``model`` is
    empty, each provider's default model is used.

    Raises:
        SynthkitError: if ``name`` is not a known provider.
    """
    if not name or name == "none":
        return None

    # Lazy factories: only the selected provider is constructed.
    factories = {
        "ollama": lambda: OllamaProvider(model or "llama3.2"),
        "anthropic": lambda: AnthropicProvider(model or "claude-haiku-4-5-20251001"),
        "openai": lambda: OpenAIProvider(model or "gpt-4o-mini"),
    }
    build = factories.get(name)
    if build is None:
        raise SynthkitError(f"unknown provider {name!r}")
    return build()
116
+
117
+
118
+ # ---- embedders (for the optional semantic axis) ------------------------------
119
+
120
class Embedder:
    """Base class for embedders; subclasses implement ``embed``."""

    name = "base"

    def embed(self, texts: List[str]) -> List[List[float]]:
        """Return one embedding vector per input text."""
        raise NotImplementedError
125
+
126
+
127
class OllamaEmbedder(Embedder):
    """Embeddings from a running Ollama daemon, one request per text."""

    name = "ollama"

    def __init__(self, model: str = "nomic-embed-text", host: str = "") -> None:
        """Remember the model and base URL.

        The host is taken from ``host``, else the ``OLLAMA_HOST``
        environment variable, else ``http://localhost:11434``; trailing
        slashes are removed.
        """
        self.model = model
        base_url = host or os.environ.get("OLLAMA_HOST", "http://localhost:11434")
        self.host = base_url.rstrip("/")

    def embed(self, texts: List[str]) -> List[List[float]]:
        """Embed each text via ``/api/embeddings``, preserving input order.

        Raises:
            SynthkitError: if a reply carries no (or an empty) ``embedding``.
        """
        endpoint = self.host + "/api/embeddings"
        vectors: List[List[float]] = []
        for text in texts:
            reply = _post_json(endpoint, {"model": self.model, "prompt": text})
            vector = reply.get("embedding")
            if not vector:
                raise SynthkitError(f"Ollama embedder returned no vector for model {self.model!r}")
            vectors.append(vector)
        return vectors
144
+
145
+
146
class OpenAIEmbedder(Embedder):
    """Embeddings via the OpenAI SDK, which is imported lazily on construction."""

    name = "openai"

    def __init__(self, model: str = "text-embedding-3-small") -> None:
        """Create the SDK client; raises SynthkitError if the SDK is not installed."""
        try:
            import openai
        except ImportError as exc:
            raise SynthkitError("--embed-provider openai needs the openai SDK. "
                                "`pip install openai`.") from exc
        self.model = model
        self._client = openai.OpenAI()

    def embed(self, texts: List[str]) -> List[List[float]]:
        """Embed all texts in a single request and return their vectors."""
        response = self._client.embeddings.create(model=self.model, input=texts)
        vectors = [item.embedding for item in response.data]
        return vectors
161
+
162
+
163
def get_embedder(name: Optional[str], model: str = "") -> Optional[Embedder]:
    """Build the embedder selected by ``name``.

    Returns ``None`` for an empty name or ``"none"``. When ``model`` is
    empty, each embedder's default model is used.

    Raises:
        SynthkitError: if ``name`` is not a known embed provider.
    """
    if not name or name == "none":
        return None

    # Lazy factories: only the selected embedder is constructed.
    factories = {
        "ollama": lambda: OllamaEmbedder(model or "nomic-embed-text"),
        "openai": lambda: OpenAIEmbedder(model or "text-embedding-3-small"),
    }
    build = factories.get(name)
    if build is None:
        raise SynthkitError(f"unknown embed provider {name!r}")
    return build()
synthkit/report.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Render a GradeReport: terminal, JSON, and a standalone HTML report."""
2
+ from __future__ import annotations
3
+
4
+ import html
5
+ import json
6
+ from typing import List
7
+
8
+ from synthkit import __version__
9
+ from synthkit.models import GradeReport, to_grade
10
+
11
+ # ---- ANSI helpers ------------------------------------------------------------
12
# ANSI escape sequences keyed by style/colour name.
_C = {
    "reset": "\033[0m",
    "bold": "\033[1m",
    "dim": "\033[2m",
    "red": "\033[31m",
    "green": "\033[32m",
    "yellow": "\033[33m",
    "blue": "\033[34m",
    "magenta": "\033[35m",
    "cyan": "\033[36m",
}
# Terminal colour name per letter grade.
GRADE_COLOR = {
    "A+": "green",
    "A": "green",
    "B": "cyan",
    "C": "yellow",
    "D": "yellow",
    "F": "red",
}
# Hex colour per letter grade, used by the HTML report.
GRADE_HEX = {
    "A+": "#16a34a",
    "A": "#16a34a",
    "B": "#0891b2",
    "C": "#ca8a04",
    "D": "#ea580c",
    "F": "#dc2626",
}
20
+
21
+
22
def _c(text: str, color: str) -> str:
    """Wrap ``text`` in the ANSI code for ``color`` followed by a reset.

    An unknown colour name adds no opening code; the reset is always appended.
    """
    opening = _C.get(color, '')
    return f"{opening}{text}{_C['reset']}"
24
+
25
+
26
+ def _bar(score: float, width: int = 21) -> str:
27
+ fill = int(round(score / 100 * width))
28
+ return "█" * fill + "░" * (width - fill)
29
+
30
+
31
def _score_color(score: float) -> str:
    """Return the terminal colour name for a numeric score ("yellow" as fallback)."""
    letter = to_grade(score)
    return GRADE_COLOR.get(letter, "yellow")
33
+
34
+
35
def _hex(score: float) -> str:
    """Return the hex colour for a numeric score ("#ca8a04" as fallback)."""
    letter = to_grade(score)
    return GRADE_HEX.get(letter, "#ca8a04")
37
+
38
+
39
def print_report(report: GradeReport, dataset: str = "", use_color: bool = True) -> None:
    """Print a report to stdout: header box, per-dimension bars, then notes.

    Args:
        report: The graded result to render.
        dataset: Optional dataset label shown in the header box.
        use_color: When False, no ANSI escape codes are emitted.
    """
    def paint(text, color):
        # Colour only when requested; otherwise pass text through untouched.
        if use_color:
            return _c(text, color)
        return text

    grade = report.grade
    print()
    print(paint(" ┌─ synthkit · data quality ──────────────────────────", "dim"))
    print(" │")
    print(f" │ Quality grade {paint(grade, GRADE_COLOR.get(grade, 'yellow'))} ({report.score}/100)")
    print(f" │ Records {report.n_records}")
    if dataset:
        print(f" │ Dataset {dataset}")
    print(" │")
    print(paint(" └────────────────────────────────────────────────────", "dim"))

    print(f"\n {paint('DIMENSIONS', 'bold')}\n")
    for dim in report.dimensions:
        if not dim.applicable:
            print(f" {paint('○', 'dim')} {dim.title:<14}{paint(' n/a', 'dim')} {paint(dim.summary, 'dim')}")
            continue
        tint = _score_color(dim.score)
        mark = paint("●", tint)
        bar = paint(_bar(dim.score), tint)
        print(f" {mark} {dim.title:<14}{dim.score:>5.0f} {bar} {paint(dim.summary, 'dim')}")

    # Flatten findings into (dimension title, note) pairs, in report order.
    notes = []
    for dim in report.dimensions:
        for finding in dim.findings:
            notes.append((dim.title, finding))
    if notes:
        print(f"\n {paint('NOTES', 'bold')}\n")
        for title, finding in notes:
            print(f" {paint('▸ ' + title + ':', 'cyan')} {finding}")

    print(f"\n {paint('Grade = weighted blend of the applicable axes · tune with --against / --field', 'dim')}")
    print()
71
+
72
+
73
def to_json(report: GradeReport, dataset: str = "") -> str:
    """Serialise a report to an indented (2-space) JSON string.

    The top-level object holds ``dataset``, ``grade``, ``score``,
    ``records``, ``dimensions`` and ``meta``; each dimension contributes its
    key, title, score, summary, findings and stats.
    """
    dimensions = []
    for dim in report.dimensions:
        dimensions.append({
            "key": dim.key, "title": dim.title, "score": dim.score,
            "summary": dim.summary, "findings": dim.findings, "stats": dim.stats,
        })
    payload = {
        "dataset": dataset,
        "grade": report.grade,
        "score": report.score,
        "records": report.n_records,
        "dimensions": dimensions,
        "meta": report.meta,
    }
    return json.dumps(payload, indent=2)
86
+
87
+
88
def to_html(report: GradeReport, dataset: str = "") -> str:
    """Render a report as a standalone HTML page with inline CSS.

    Dimension titles, summaries, findings and the dataset label are
    HTML-escaped. Dimensions without a score are shown as "n/a" with no bar.
    """
    gcolor = GRADE_HEX.get(report.grade, "#ca8a04")

    def card(dim) -> str:
        # One card per dimension; scored dimensions get a coloured bar.
        if not dim.applicable:
            return f"""
<div class="dim">
<div class="dim-head">
<span class="dt">{html.escape(dim.title)}</span>
<span class="ds na">n/a</span>
</div>
<div class="dsum">{html.escape(dim.summary)}</div>
</div>"""
        tint = _hex(dim.score)
        return f"""
<div class="dim">
<div class="dim-head">
<span class="dt">{html.escape(dim.title)}</span>
<span class="ds" style="color:{tint}">{dim.score:.0f}</span>
</div>
<div class="track"><div class="fill" style="width:{dim.score:.0f}%;background:{tint}"></div></div>
<div class="dsum">{html.escape(dim.summary)}</div>
</div>"""

    rows: List[str] = [card(dim) for dim in report.dimensions]

    notes = []
    for dim in report.dimensions:
        for finding in dim.findings:
            notes.append(f'<li><b>{html.escape(dim.title)}:</b> {html.escape(finding)}</li>')

    dataset_suffix = (' · ' + html.escape(dataset)) if dataset else ''
    if notes:
        notes_html = '<ul class="notes">' + ''.join(notes) + '</ul>'
    else:
        notes_html = '<p style="color:#86efac">Nothing flagged.</p>'
    rows_html = ''.join(rows)

    return f"""<!doctype html><html lang="en"><head><meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>synthkit report</title>
<style>
:root {{ color-scheme: light dark; }}
body {{ font: 15px/1.55 -apple-system, Segoe UI, Roboto, sans-serif; margin: 0;
background: #0b1020; color: #e5e7eb; }}
.wrap {{ max-width: 760px; margin: 0 auto; padding: 40px 24px 80px; }}
h1 {{ font-size: 20px; letter-spacing: .3px; margin: 0 0 4px; }}
.sub {{ color: #94a3b8; margin: 0 0 28px; font-size: 13px; }}
.hero {{ display: flex; gap: 24px; align-items: center; background: #111934;
border: 1px solid #1e293b; border-radius: 14px; padding: 24px; margin-bottom: 28px; }}
.grade {{ font-size: 56px; font-weight: 800; line-height: 1; color: {gcolor}; }}
.meta {{ flex: 1; }}
.meta .big {{ font-size: 15px; margin-bottom: 6px; }}
.meta .small {{ color: #94a3b8; font-size: 13px; }}
h2 {{ font-size: 13px; text-transform: uppercase; letter-spacing: 1px; color: #94a3b8;
margin: 32px 0 14px; }}
.dim {{ background: #111934; border: 1px solid #1e293b; border-radius: 12px;
padding: 14px 18px; margin-bottom: 10px; }}
.dim-head {{ display: flex; justify-content: space-between; align-items: baseline; }}
.dt {{ font-weight: 700; }}
.ds {{ font-size: 22px; font-weight: 800; }}
.ds.na {{ color: #64748b; font-size: 15px; font-weight: 600; }}
.track {{ height: 8px; background: #0b1020; border-radius: 999px; margin: 10px 0 8px; overflow: hidden; }}
.fill {{ height: 100%; border-radius: 999px; }}
.dsum {{ color: #cbd5e1; font-size: 13px; }}
ul.notes {{ list-style: none; padding: 0; margin: 0; }}
ul.notes li {{ background: #0e1830; border: 1px solid #1e293b; border-left: 3px solid #38bdf8;
border-radius: 8px; padding: 9px 14px; margin-bottom: 8px; font-size: 13.5px; }}
ul.notes b {{ color: #818cf8; }}
.foot {{ margin-top: 36px; color: #64748b; font-size: 12px; }}
</style></head><body><div class="wrap">
<h1>synthkit: synthetic data quality report</h1>
<p class="sub">validity · uniqueness · diversity · contamination</p>
<div class="hero">
<div class="grade">{report.grade}</div>
<div class="meta">
<div class="big">Quality score <b>{report.score}/100</b></div>
<div class="small">{report.n_records} records{dataset_suffix}</div>
</div>
</div>
<h2>Dimensions</h2>
{rows_html}
<h2>Notes</h2>
{notes_html}
<p class="foot">Generated by synthkit v{__version__} · grade is a weighted blend of the applicable axes.</p>
</div></body></html>"""
synthkit/tabular/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ """Product B: schema-aware tabular fixtures with referential integrity. [roadmap]
2
+
3
+ Slots into the same core as Product A: it will emit records that the shared
4
+ grading engine (validity / uniqueness / diversity, plus tabular-specific
5
+ referential-integrity and PII-safety checks) scores with the same report.
6
+ """
synthkit/text/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Product A: synthetic instruction & evaluation datasets for LLMs."""
synthkit/text/generate.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Generate text records (eval prompts or instruction→output pairs) from a seed spec.
2
+
3
+ Two phases:
4
+ 1. sample prompts: deterministic for a fixed seed; exact-duplicate prompts are
5
+ skipped by default so naive slot collisions don't pad the dataset.
6
+ 2. fill responses: only for instruction data with response.mode == 'provider';
7
+ runs concurrently with a progress callback.
8
+
9
+ Optional: pass a `dedup_embedder` to dedup *by meaning as you generate*: each
10
+ candidate is embedded and rejected if it's within `dedup_threshold` cosine of an
11
+ already-accepted record.
12
+
13
+ Templates are rendered with a safe `{slot}`-only substitution (NOT str.format):
14
+ attribute/index access and format specs are treated as literal text, so an
15
+ untrusted template can't reach object internals or trigger a format-spec blow-up.
16
+ """
17
+ from __future__ import annotations
18
+
19
+ import random
20
+ import re
21
+ from typing import Any, Callable, Dict, List, Optional, Tuple
22
+
23
+ from synthkit.grading import record_text
24
+ from synthkit.models import SynthkitError
25
+ from synthkit.providers import Embedder, Provider
26
+ from synthkit.util import max_cosine, pmap, unit
27
+
28
_PLACEHOLDER = re.compile(r"\{(\w+)\}")
_MAX_RECORD_CHARS = 100_000  # guards against a pathological slot value


def render(template: str, fill: Dict[str, str]) -> str:
    """Substitute only bare ``{slot}`` placeholders; everything else stays literal.

    Anything that is not ``{`` + word characters + ``}`` (attribute access,
    indexing, format specs) does not match the placeholder pattern and is
    left untouched.

    Raises:
        SynthkitError: if a placeholder has no entry in ``fill``, or the
            rendered text is longer than ``_MAX_RECORD_CHARS``.
    """
    def _substitute(match: "re.Match[str]") -> str:
        slot = match.group(1)
        if slot in fill:
            return str(fill[slot])
        raise SynthkitError(f"template slot '{slot}' is missing from 'slots'")

    rendered = _PLACEHOLDER.sub(_substitute, template)
    if len(rendered) <= _MAX_RECORD_CHARS:
        return rendered
    raise SynthkitError(f"rendered record exceeds {_MAX_RECORD_CHARS} characters")
44
+
45
+
46
def _rule_response(resp_cfg: Dict[str, Any], fill: Dict[str, str]) -> str:
    """Render the response config's ``template`` (empty if absent) with the slot fill."""
    template = str(resp_cfg.get("template", ""))
    return render(template, fill)
48
+
49
+
50
+ def _shape_record(kind: str, prompt: str, response: str,
51
+ system: str, domain: str) -> Dict[str, Any]:
52
+ if kind == "instruction":
53
+ rec: Dict[str, Any] = {"instruction": prompt, "input": "", "output": response}
54
+ if system:
55
+ rec["system"] = system
56
+ else:
57
+ rec = {"prompt": prompt}
58
+ if domain:
59
+ rec["domain"] = domain
60
+ return rec
61
+
62
+
63
+ def _response_for(kind: str, mode: str, resp_cfg: Dict[str, Any], fill: Dict[str, str],
64
+ prompt: str, system: str, provider: Optional[Provider]) -> str:
65
+ if kind != "instruction":
66
+ return ""
67
+ if mode == "none":
68
+ return ""
69
+ if mode == "rule":
70
+ return _rule_response(resp_cfg, fill)
71
+ if mode == "provider":
72
+ return provider.generate(prompt, system) # type: ignore[union-attr]
73
+ raise SynthkitError(f"unknown response.mode {mode!r}")
74
+
75
+
76
def sample_prompts(spec: Dict[str, Any], n: int, *, seed: int = 17,
                   dedup: bool = True,
                   max_attempts: Optional[int] = None
                   ) -> List[Tuple[str, Dict[str, str]]]:
    """Phase 1: return up to n (prompt, slot-fill) pairs.

    Sampling is deterministic for a fixed ``seed``. Each attempt picks a
    random template and one random value per slot, renders the prompt, and
    drops it when it is shorter than ``constraints.min_words`` or (with
    ``dedup``) equal to an earlier prompt after case/whitespace
    normalisation. Fewer than ``n`` pairs are returned when the attempt
    budget runs out.

    Args:
        spec: Seed spec with ``templates``, optional ``slots`` and ``constraints``.
        n: Number of prompts wanted.
        seed: Seed for the private random generator.
        dedup: Skip exact-duplicate prompts.
        max_attempts: Attempt budget; defaults to ``max(n * 50, 200)``.

    Raises:
        SynthkitError: if the spec has no templates, a slot has no values,
            or a template references an undefined slot.
    """
    templates = spec.get("templates") or []
    if not templates:
        raise SynthkitError("seed spec has no 'templates'")
    slots: Dict[str, List[str]] = spec.get("slots") or {}
    # rng.choice() on an empty list raises a bare IndexError; report the bad
    # spec as a user-facing error instead.
    empty_slots = [name for name, values in slots.items() if not values]
    if empty_slots:
        raise SynthkitError(
            "seed spec slot(s) have no values: "
            + ", ".join(repr(name) for name in empty_slots))
    min_words = int((spec.get("constraints") or {}).get("min_words", 0))

    rng = random.Random(seed)
    out: List[Tuple[str, Dict[str, str]]] = []
    seen: set = set()
    attempts = 0
    cap = max_attempts if max_attempts is not None else max(n * 50, 200)
    while len(out) < n and attempts < cap:
        attempts += 1
        template = rng.choice(templates)
        fill = {k: rng.choice(v) for k, v in slots.items()}
        prompt = render(template, fill)
        if len(prompt.split()) < min_words:
            continue
        if dedup:
            # Normalise case and whitespace so trivial variants collide.
            key = " ".join(prompt.lower().split())
            if key in seen:
                continue
            seen.add(key)
        out.append((prompt, fill))
    return out
106
+
107
+
108
def _generate_dedup(spec, n, provider, seed, dedup, progress,
                    embedder: Embedder, threshold: float,
                    stats: Optional[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Generate up to ``n`` records, rejecting semantic near-duplicates as it goes.

    An oversized prompt pool is sampled first. Each candidate record is
    embedded and kept only if its cosine similarity to every already
    accepted record is below ``threshold``. When ``stats`` is a dict it
    receives ``rejected_semantic``, ``attempts`` and ``pool``.
    """
    kind = spec.get("kind", "eval")
    system = spec.get("system", "")
    domain = spec.get("domain", "")
    resp_cfg = spec.get("response") or {"mode": "none"}
    mode = resp_cfg.get("mode", "none")

    # Over-sample so enough candidates remain after rejections.
    pool_size = max(n * 5, n + 100)
    pool = sample_prompts(spec, pool_size, seed=seed, dedup=dedup)

    accepted: List[Dict[str, Any]] = []
    accepted_units: List[List[float]] = []
    rejected = 0
    attempts = 0
    for prompt, fill in pool:
        if len(accepted) >= n:
            break
        attempts += 1
        response = _response_for(kind, mode, resp_cfg, fill, prompt, system, provider)
        candidate = _shape_record(kind, prompt, response, system, domain)
        vector = unit(embedder.embed([record_text(candidate, None)])[0])
        too_close = bool(accepted_units) and max_cosine(vector, accepted_units) >= threshold
        if too_close:
            rejected += 1
            continue
        accepted.append(candidate)
        accepted_units.append(vector)
        # NOTE(review): source indentation was lost in this view; confirm
        # whether progress is reported per accepted record or per candidate.
        if progress:
            progress(len(accepted), n)

    if stats is not None:
        stats["rejected_semantic"] = rejected
        stats["attempts"] = attempts
        stats["pool"] = len(pool)
    return accepted
140
+
141
+
142
def generate(spec: Dict[str, Any], n: int, *, provider: Optional[Provider] = None,
             seed: int = 17, dedup: bool = True,
             max_attempts: Optional[int] = None, concurrency: int = 1,
             progress: Optional[Callable[[int, int], None]] = None,
             dedup_embedder: Optional[Embedder] = None,
             dedup_threshold: float = 0.9,
             stats: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
    """Generate up to ``n`` records from a seed spec.

    Prompts are sampled first; for instruction data the responses are then
    filled by rule or by calling ``provider`` (optionally across
    ``concurrency`` threads, reporting through ``progress``). When
    ``dedup_embedder`` is given, generation is delegated to the semantic
    dedup path with ``dedup_threshold`` and ``stats``.

    Raises:
        SynthkitError: if the spec asks for provider responses but no
            provider was passed, or ``response.mode`` is unknown.
    """
    kind = spec.get("kind", "eval")
    system = spec.get("system", "")
    domain = spec.get("domain", "")
    resp_cfg = spec.get("response") or {"mode": "none"}
    mode = resp_cfg.get("mode", "none")

    if provider is None and mode == "provider":
        raise SynthkitError("response.mode is 'provider' but no provider was selected "
                            "(pass --provider ollama|anthropic|openai)")

    if dedup_embedder is not None:
        return _generate_dedup(spec, n, provider, seed, dedup, progress,
                               dedup_embedder, dedup_threshold, stats)

    prompts = sample_prompts(spec, n, seed=seed, dedup=dedup, max_attempts=max_attempts)
    responses: List[str] = [""] * len(prompts)
    wants_responses = kind == "instruction" and mode != "none"
    if wants_responses:
        if mode == "rule":
            responses = [_rule_response(resp_cfg, fill) for _prompt, fill in prompts]
        elif mode == "provider":
            def ask(pair):
                return provider.generate(pair[0], system)
            responses = pmap(ask, prompts, concurrency=concurrency, progress=progress)
        else:
            raise SynthkitError(f"unknown response.mode {mode!r}")

    records = []
    for (prompt, _fill), response in zip(prompts, responses):
        records.append(_shape_record(kind, prompt, response, system, domain))
    return records
synthkit/text/seeds.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Built-in seed specs used by `synthkit text gen --demo`.
2
+
3
+ A seed spec is plain data (works as JSON or YAML on disk too):
4
+
5
+ kind "eval" (prompt-only) or "instruction" (instruction→output)
6
+ templates sentence templates with {slot} placeholders
7
+ slots lists of fillers, one per placeholder name
8
+ response how to fill the output for instruction data:
9
+ {"mode": "none"} leave blank (eval sets)
10
+ {"mode": "rule", "template": ...} fill a string template (offline)
11
+ {"mode": "provider"} call an LLM provider
12
+ constraints optional, e.g. {"min_words": 4}
13
+ """
14
+ from __future__ import annotations
15
+
16
# Slot fillers shared by both demo specs.
_LANGS = [
    "Python",
    "JavaScript",
    "Rust",
    "Go",
    "TypeScript",
    "Java",
    "C++",
    "Ruby",
]
_TASKS = [
    "reverse a string",
    "check whether a number is prime",
    "merge two sorted lists",
    "find the longest common subsequence of two strings",
    "parse an ISO-8601 date",
    "debounce a function",
    "flatten a deeply nested list",
    "compute a moving average over a stream",
    "detect a cycle in a linked list",
    "implement binary search",
]

# Prompt-only evaluation set: no responses are generated.
DEMO_EVAL = dict(
    kind="eval",
    domain="coding",
    templates=[
        "Write a {language} function that {task}.",
        "How would you {task} in {language}? Walk through your reasoning.",
        "Review this {language} snippet that is meant to {task} and point out the bugs.",
        "Explain to a beginner how to {task} using {language}.",
        "What's the most efficient way to {task} in {language}, and why?",
        "Refactor a {language} program that {task} to be more readable.",
    ],
    slots=dict(language=_LANGS, task=_TASKS),
    constraints=dict(min_words=4),
    response=dict(mode="none"),
)

# Instruction set whose outputs are filled offline from a string template.
DEMO_INSTRUCTION = dict(
    kind="instruction",
    domain="coding",
    system="You are a precise, helpful coding assistant.",
    templates=[
        "Write a {language} function that {task}.",
        "Show me how to {task} in {language}.",
        "I need {language} code to {task}. Include a short explanation.",
    ],
    slots=dict(language=_LANGS, task=_TASKS),
    constraints=dict(min_words=4),
    response=dict(
        mode="rule",
        template="Here's an approach in {language} to {task}: start by clarifying "
                 "the inputs and edge cases, then implement the core logic and test it.",
    ),
)

BUILTIN_SEEDS = dict(eval=DEMO_EVAL, instruction=DEMO_INSTRUCTION)
synthkit/util.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Small shared utilities."""
2
+ from __future__ import annotations
3
+
4
+ import math
5
+ from concurrent.futures import ThreadPoolExecutor, as_completed
6
+ from typing import Callable, Iterable, List, Optional, Sequence, TypeVar
7
+
8
T = TypeVar("T")
R = TypeVar("R")


def pmap(fn: Callable[[T], R], items: Iterable[T], concurrency: int = 1,
         progress: Optional[Callable[[int, int], None]] = None) -> List[R]:
    """Map fn over items, optionally across threads, preserving input order.

    With ``concurrency <= 1`` the items are processed sequentially in the
    calling thread; otherwise a thread pool of that size is used.

    Exceptions propagate (the first one raised wins). When that happens in
    the threaded path, work that has not started yet is cancelled so a
    failure does not keep consuming the remaining items; calls already
    running are allowed to finish. `progress(done, total)` is called after
    each item completes.
    """
    items = list(items)
    total = len(items)
    results: List[Optional[R]] = [None] * total
    if concurrency <= 1:
        for i, it in enumerate(items):
            results[i] = fn(it)
            if progress:
                progress(i + 1, total)
        return results  # type: ignore[return-value]

    done = 0
    with ThreadPoolExecutor(max_workers=concurrency) as ex:
        futs = {ex.submit(fn, it): i for i, it in enumerate(items)}
        try:
            for fut in as_completed(futs):
                results[futs[fut]] = fut.result()
                done += 1
                if progress:
                    progress(done, total)
        except BaseException:
            # Leaving the `with` block waits for every submitted future;
            # cancel the queued ones first so they never run.
            for pending in futs:
                pending.cancel()
            raise
    return results  # type: ignore[return-value]
38
+
39
+
40
def unit(vec: Sequence[float]) -> List[float]:
    """L2-normalize a vector (a zero vector maps to itself)."""
    norm = math.sqrt(sum(x * x for x in vec))
    if not norm:
        # Zero-length vector: divide by 1 so the components stay zero.
        norm = 1.0
    return [component / norm for component in vec]
44
+
45
+
46
def max_cosine(u: Sequence[float], units: Sequence[Sequence[float]]) -> float:
    """Max cosine similarity between unit vector u and a list of unit vectors.

    The result is never below 0.0: that is the starting value, so an empty
    ``units`` or all-negative similarities return 0.0.
    """
    def dot(other: Sequence[float]) -> float:
        # Plain left-to-right accumulation keeps float results reproducible.
        acc = 0.0
        for a, b in zip(u, other):
            acc += a * b
        return acc

    highest = 0.0
    for candidate in units:
        similarity = dot(candidate)
        if similarity > highest:
            highest = similarity
    return highest