Spaces:

LaelaZ
/

synthkit

Sleeping

App Files Files Community

LaelaZ committed 5 days ago

Commit

35ff0c7

verified ·

1 Parent(s): 3b33545

Sync package to GitHub source: em-dashes out of rendered output; no API/logic change

Browse files

Files changed (15) hide show

synthkit/__init__.py +13 -0
synthkit/__main__.py +5 -0
synthkit/cli.py +260 -0
synthkit/formats.py +54 -0
synthkit/grading.py +422 -0
synthkit/io_utils.py +68 -0
synthkit/models.py +61 -0
synthkit/privacy/__init__.py +6 -0
synthkit/providers.py +170 -0
synthkit/report.py +164 -0
synthkit/tabular/__init__.py +6 -0
synthkit/text/__init__.py +1 -0
synthkit/text/generate.py +175 -0
synthkit/text/seeds.py +64 -0
synthkit/util.py +55 -0

synthkit/__init__.py ADDED Viewed

	@@ -0,0 +1,13 @@

+"""synthkit: generate synthetic data and grade it for quality.
+Three products on one core:
+  • text     (A): instruction / eval datasets for training & evaluating LLMs   [live]
+  • tabular  (B): schema-aware fixtures with referential integrity            [roadmap]
+  • privacy  (C): privacy-safe synthetic twins of real datasets               [roadmap]
+The generator layer differs per product; the grading engine, providers, and
+report writers are shared.
+"""
+from __future__ import annotations
+__version__ = "0.4.0"

synthkit/__main__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+"""`python3 -m synthkit …`"""
+from synthkit.cli import main
+if __name__ == "__main__":
+    main()

synthkit/cli.py ADDED Viewed

	@@ -0,0 +1,260 @@

+"""Command-line interface for synthkit."""
+from __future__ import annotations
+import argparse
+import sys
+from typing import List
+from synthkit import __version__
+from synthkit.formats import FORMATS, to_format
+from synthkit.grading import grade_dataset
+from synthkit.io_utils import load_spec, read_records, write_jsonl, write_text
+from synthkit.models import SynthkitError
+from synthkit.providers import get_embedder, get_provider
+from synthkit.report import print_report, to_html, to_json
+from synthkit.text.generate import generate
+from synthkit.text.seeds import BUILTIN_SEEDS, DEMO_EVAL
+_GRADE_ORDER = ["F", "D", "C", "B", "A", "A+"]
+def _progress(done: int, total: int) -> None:
+    """Redraw the one-line generation counter on stderr.
+    The line starts with a carriage return so it overwrites itself; a newline is
+    emitted only once `done` reaches `total`, so later output starts on a fresh line.
+    """
+    finished = done >= total
+    terminator = "\n" if finished else ""
+    message = f"\r  generated {done}/{total}"
+    print(message, end=terminator, file=sys.stderr, flush=True)
+def _maybe_embedder(args):
+    """Return an embedder when `args.semantic` is set, otherwise None.
+    Every attribute is read with a default, so a namespace that never declared
+    the embedding flags is still accepted.
+    """
+    if getattr(args, "semantic", False):
+        provider_name = getattr(args, "embed_provider", "ollama")
+        model_name = getattr(args, "embed_model", "")
+        return get_embedder(provider_name, model_name)
+    return None
+# ---- text gen ----------------------------------------------------------------
+def _run_demo(args) -> int:
+    """Generate the built-in demo dataset, grade it, and write all artifacts.
+    Writes synthkit_demo.jsonl, synthkit_demo.benchmark.jsonl,
+    synthkit_demo.report.json and synthkit_demo.report.html (relative paths).
+    Reads `args.ngram` and `args.no_color`. Always returns 0.
+    """
+    print("synthkit demo: generating a coding eval set, then grading it…",
+          file=sys.stderr)
+    # Train and eval are drawn from DISJOINT tasks (a genuine held-out split), then a
+    # known handful of eval records are deliberately leaked into train, so the
+    # contamination axis reflects real leakage, not one generator overlapping itself.
+    tasks = DEMO_EVAL["slots"]["task"]
+    # first seven tasks feed the train spec; the remainder feed the eval spec
+    train_spec = {**DEMO_EVAL, "slots": {**DEMO_EVAL["slots"], "task": tasks[:7]}}
+    eval_spec = {**DEMO_EVAL, "slots": {**DEMO_EVAL["slots"], "task": tasks[7:]}}
+    data = generate(train_spec, 195, seed=17)
+    bench = generate(eval_spec, 40, seed=99)
+    # shallow-copy each leaked record so train and benchmark do not share dict objects
+    leaks = [dict(r) for r in bench[:5]]              # 5 genuine, verbatim leaks
+    data = data + leaks
+    write_jsonl("synthkit_demo.jsonl", data)
+    write_jsonl("synthkit_demo.benchmark.jsonl", bench)
+    report = grade_dataset(data, against=bench, ngram=args.ngram)
+    print_report(report, dataset="synthkit_demo.jsonl", use_color=not args.no_color)
+    write_text("synthkit_demo.report.json", to_json(report, "synthkit_demo.jsonl"))
+    write_text("synthkit_demo.report.html", to_html(report, "synthkit_demo.jsonl"))
+    print(f"  wrote synthkit_demo.jsonl ({len(data)} records) + benchmark ({len(bench)}) "
+          "+ .report.json + .report.html", file=sys.stderr)
+    print("  note: eval uses HELD-OUT tasks; 5 records were deliberately leaked into "
+          "train, so contamination flags exactly those real leaks.", file=sys.stderr)
+    return 0
+def cmd_text_gen(args) -> int:
+    """Handle `synthkit text gen`: generate a dataset, write it, optionally grade it.
+    With `--demo` the work is delegated to `_run_demo`. Otherwise `--seed` is
+    required and the process exits with an error message when it is missing.
+    Returns 0 on completion.
+    """
+    if args.demo:
+        return _run_demo(args)
+    if not args.seed:
+        sys.exit("error: provide --seed FILE (a JSON/YAML seed spec) or use --demo")
+    spec = load_spec(args.seed)
+    if args.kind:
+        # a --kind given on the command line overrides the kind stored in the spec
+        spec["kind"] = args.kind
+    provider = get_provider(args.provider, args.model)
+    dedup_embedder = get_embedder(args.embed_provider, args.embed_model) if args.dedup_semantic else None
+    # the progress line is shown only when a provider or semantic dedup is in use
+    busy = args.provider != "none" or args.dedup_semantic
+    show = _progress if (busy and not args.no_progress) else None
+    # passed to generate() as `stats`; the keys read below are "rejected_semantic"
+    # and "attempts"
+    gstats: dict = {}
+    print(f"synthkit: generating {args.num} records from {args.seed}…", file=sys.stderr)
+    data = generate(spec, args.num, provider=provider, seed=args.seed_int,
+                    dedup=not args.no_dedup, concurrency=args.concurrency, progress=show,
+                    dedup_embedder=dedup_embedder, dedup_threshold=args.dedup_threshold,
+                    stats=gstats)
+    if args.dedup_semantic and gstats.get("rejected_semantic"):
+        print(f"  semantic dedup: rejected {gstats['rejected_semantic']} of "
+              f"{gstats['attempts']} candidates (cosine ≥ {args.dedup_threshold})",
+              file=sys.stderr)
+    if len(data) < args.num:
+        # fewer records than requested: tell the user which knob to turn
+        reason = ("raise --dedup-threshold or add slot variety" if args.dedup_semantic
+                  else "the seed's template×slot space is exhausted "
+                       "(add slot variety or pass --no-dedup)")
+        print(f"  note: produced {len(data)} of {args.num} requested, {reason}.",
+              file=sys.stderr)
+    out = args.out or "synth_text.jsonl"
+    write_jsonl(out, to_format(data, args.format))
+    print(f"  wrote {len(data)} records to {out}"
+          + (f"  ({args.format} format)" if args.format != "raw" else ""))
+    if not args.no_grade:
+        # grading runs on the generated records, not on the --format conversion
+        against = read_records(args.against) if args.against else None
+        report = grade_dataset(data, against=against, ngram=args.ngram,
+                               embedder=_maybe_embedder(args))
+        print_report(report, dataset=out, use_color=not args.no_color)
+        if args.report_json:
+            write_text(args.report_json, to_json(report, out))
+        if args.html:
+            write_text(args.html, to_html(report, out))
+    return 0
+# ---- grade -------------------------------------------------------------------
+def cmd_grade(args) -> int:
+    """Handle `synthkit grade`: grade a dataset file and optionally gate on the grade.
+    Exits with an error message when the file yields no records. Returns 1 when
+    `--min-grade` is given and the achieved grade ranks below it in
+    `_GRADE_ORDER`; returns 0 otherwise.
+    """
+    dataset_path = args.path
+    records = read_records(dataset_path)
+    if not records:
+        sys.exit(f"error: no records found in {dataset_path}")
+    against = None
+    if args.against:
+        against = read_records(args.against)
+    embedder = _maybe_embedder(args)
+    report = grade_dataset(records, fields=args.field or None, against=against,
+                           ngram=args.ngram, embedder=embedder)
+    print_report(report, dataset=dataset_path, use_color=not args.no_color)
+    if args.json:
+        write_text(args.json, to_json(report, dataset_path))
+        print(f"  JSON written to {args.json}")
+    if args.html:
+        write_text(args.html, to_html(report, dataset_path))
+        print(f"  HTML written to {args.html}")
+    if not args.min_grade:
+        return 0
+    # compare by position in the worst-to-best ordering
+    achieved = _GRADE_ORDER.index(report.grade)
+    required = _GRADE_ORDER.index(args.min_grade)
+    if achieved >= required:
+        return 0
+    print(f"  grade {report.grade} is below --min-grade {args.min_grade}",
+          file=sys.stderr)
+    return 1
+# ---- list / roadmap ----------------------------------------------------------
+def cmd_list(args) -> int:
+    """Print the product overview followed by the built-in text seeds; returns 0."""
+    overview = (
+        "\nsynthkit products\n",
+        "  text      (A) live     LLM instruction & eval datasets + quality grading",
+        "  tabular   (B) roadmap  schema-aware fixtures with referential integrity",
+        "  privacy   (C) roadmap  privacy-safe synthetic twins of real datasets",
+        "\n  built-in text seeds (use with: text gen --demo, or copy from examples/)\n",
+    )
+    for line in overview:
+        print(line)
+    for name, spec in BUILTIN_SEEDS.items():
+        template_count = len(spec["templates"])
+        slot_summary = " × ".join(
+            f"{len(values)} {slot}" for slot, values in spec["slots"].items())
+        print(f"    {name:<12} {spec['kind']:<12} {template_count} templates · {slot_summary}")
+    print()
+    return 0
+def cmd_coming_soon(args) -> int:
+    """Print the roadmap notice for a product that is not implemented yet.
+    Reads `args.product` and `args.blurb`; always returns 0.
+    """
+    notice = (
+        f"\n  synthkit {args.product}: {args.blurb}",
+        "  On the roadmap. Product A (`synthkit text`) is live today and B/C share",
+        "  the same core: providers, the grading engine, and the report writer.\n",
+    )
+    for line in notice:
+        print(line)
+    return 0
+# ---- parser ------------------------------------------------------------------
+def build_parser() -> argparse.ArgumentParser:
+    """Build the top-level `synthkit` parser and all of its subcommands.
+    Subcommands: `text gen`, `grade`, `list`, plus the `tabular` and `privacy`
+    roadmap stubs. Each leaf parser stores its handler in `func` via
+    `set_defaults`, which is what the entry point dispatches on.
+    """
+    def _positive_int(text: str) -> int:
+        # Used for --ngram. An n-gram size of 0 makes the n-gram set of every
+        # non-empty text collapse to the single empty tuple, so every record
+        # would look 100% contained in the eval set; negative sizes are
+        # meaningless. Reject both at the command line.
+        try:
+            value = int(text)
+        except ValueError:
+            raise argparse.ArgumentTypeError(f"invalid int value: {text!r}") from None
+        if value < 1:
+            raise argparse.ArgumentTypeError(f"must be a positive integer, got {value}")
+        return value
+    p = argparse.ArgumentParser(
+        prog="synthkit",
+        description="Generate synthetic data and grade it for quality "
+                    "(validity, uniqueness, diversity, contamination).")
+    p.add_argument("--version", action="version", version=f"synthkit {__version__}")
+    sub = p.add_subparsers(dest="cmd")
+    # text (Product A) with its own subcommands
+    text_p = sub.add_parser("text", help="Product A: LLM instruction & eval datasets")
+    text_sub = text_p.add_subparsers(dest="text_cmd")
+    gen = text_sub.add_parser("gen", help="generate a text dataset and grade it")
+    gen.add_argument("--demo", action="store_true",
+                     help="generate + grade a built-in coding eval set (no setup)")
+    gen.add_argument("--seed", help="path to a JSON/YAML seed spec")
+    gen.add_argument("-n", "--num", type=int, default=200, help="records to generate")
+    gen.add_argument("--kind", choices=["eval", "instruction"],
+                     help="override the spec's kind")
+    gen.add_argument("--provider", default="none",
+                     choices=["none", "ollama", "anthropic", "openai"],
+                     help="response generator for instruction data (default: none)")
+    gen.add_argument("--model", default="", help="model name for the provider")
+    gen.add_argument("-o", "--out", help="output JSONL path (default: synth_text.jsonl)")
+    gen.add_argument("--seed-int", type=int, default=17, help="RNG seed (default: 17)")
+    gen.add_argument("--no-dedup", action="store_true",
+                     help="keep exact-duplicate prompts instead of skipping them")
+    gen.add_argument("--no-grade", action="store_true", help="skip grading the output")
+    gen.add_argument("--against", help="held-out eval set to check contamination against")
+    gen.add_argument("--ngram", type=_positive_int, default=8,
+                     help="contamination n-gram size")
+    gen.add_argument("--format", default="raw", choices=list(FORMATS),
+                     help="output schema: raw|alpaca|sharegpt|openai (default: raw)")
+    gen.add_argument("--concurrency", type=int, default=4,
+                     help="parallel provider calls when filling responses (default: 4)")
+    gen.add_argument("--no-progress", action="store_true",
+                     help="hide the response progress line")
+    gen.add_argument("--semantic", action="store_true",
+                     help="add an embedding-based semantic-dedup axis to grading")
+    gen.add_argument("--embed-provider", default="ollama", choices=["ollama", "openai"],
+                     help="embedder for --semantic (default: ollama)")
+    gen.add_argument("--embed-model", default="", help="embedding model name")
+    gen.add_argument("--dedup-semantic", action="store_true",
+                     help="reject semantically-similar records during generation "
+                          "(clean-by-construction; uses --embed-provider)")
+    gen.add_argument("--dedup-threshold", type=float, default=0.9,
+                     help="cosine ≥ this ⇒ reject as a semantic duplicate (default: 0.9)")
+    gen.add_argument("--report-json", help="write the grade report as JSON")
+    gen.add_argument("--html", help="write the grade report as HTML")
+    gen.add_argument("--no-color", action="store_true", help="disable ANSI colors")
+    gen.set_defaults(func=cmd_text_gen)
+    # grade (shared across all products)
+    gr = sub.add_parser("grade", help="grade any dataset (jsonl/json/csv) for quality")
+    gr.add_argument("path", help="dataset to grade")
+    gr.add_argument("--against", help="held-out eval set for the contamination check")
+    gr.add_argument("--field", action="append", default=[],
+                    help="field(s) to analyze (repeatable; default: auto-detect)")
+    gr.add_argument("--ngram", type=_positive_int, default=8,
+                    help="contamination n-gram size")
+    gr.add_argument("--min-grade", choices=_GRADE_ORDER,
+                    help="exit non-zero if the grade is below this (CI gate)")
+    gr.add_argument("--semantic", action="store_true",
+                    help="add an embedding-based semantic-dedup axis")
+    gr.add_argument("--embed-provider", default="ollama", choices=["ollama", "openai"],
+                    help="embedder for --semantic (default: ollama)")
+    gr.add_argument("--embed-model", default="", help="embedding model name")
+    gr.add_argument("--json", help="write the report as JSON")
+    gr.add_argument("--html", help="write the report as HTML")
+    gr.add_argument("--no-color", action="store_true", help="disable ANSI colors")
+    gr.set_defaults(func=cmd_grade)
+    # list
+    ls = sub.add_parser("list", help="list products and built-in seeds")
+    ls.set_defaults(func=cmd_list)
+    # roadmap stubs
+    tb = sub.add_parser("tabular", help="Product B: schema-aware fixtures [roadmap]")
+    tb.set_defaults(func=cmd_coming_soon, product="tabular",
+                    blurb="schema-aware fixtures with referential integrity")
+    pv = sub.add_parser("privacy", help="Product C: privacy-safe twins [roadmap]")
+    pv.set_defaults(func=cmd_coming_soon, product="privacy",
+                    blurb="privacy-safe synthetic twins of real datasets")
+    return p
+def main(argv: List[str] | None = None) -> None:
+    """Parse the command line, dispatch to the selected handler, and exit.
+    `argv` defaults to None, in which case argparse reads `sys.argv[1:]`.
+    The process exit status is the handler's return value; a `SynthkitError`
+    raised by the handler is reported as `error: <message>` with a non-zero
+    status. With no subcommand the top-level help is printed and the exit
+    status is 0.
+    """
+    # The annotation spells out that None is accepted; the unevaluated
+    # `X | None` form is valid here because the module uses postponed annotations.
+    parser = build_parser()
+    args = parser.parse_args(argv)
+    if not getattr(args, "cmd", None):
+        parser.print_help()
+        sys.exit(0)
+    # bare `synthkit text` with no subcommand: re-parsing with --help makes
+    # argparse print the `text` help and exit, so the dispatch below is not reached
+    if args.cmd == "text" and not getattr(args, "func", None):
+        parser.parse_args(["text", "--help"])
+    try:
+        sys.exit(args.func(args))
+    except SynthkitError as exc:
+        sys.exit(f"error: {exc}")
+if __name__ == "__main__":
+    main()

synthkit/formats.py ADDED Viewed

	@@ -0,0 +1,54 @@

+"""Convert generated records into common fine-tuning dataset formats.
+  raw       what the generator emits (alpaca-ish: instruction/input/output, or prompt)
+  alpaca    {instruction, input, output}
+  sharegpt  {conversations: [{from: human, value}, {from: gpt, value}]}
+  openai    {messages: [{role: system?}, {role: user}, {role: assistant}]}
+Eval (prompt-only) records keep their prompt as the human/user turn with an
+empty completion, so the same dataset can drive evaluation or be completed later.
+"""
+from __future__ import annotations
+from typing import Any, Dict, List
+from synthkit.models import SynthkitError
+FORMATS = ("raw", "alpaca", "sharegpt", "openai")
+def _parts(rec: Dict[str, Any]):
+    """Split a record into (system, instruction, input, user turn, output).
+    `instruction` falls back to `prompt` and `output` falls back to `response`
+    only when the primary key is absent. The user turn is the instruction,
+    followed by a blank line and the input when the input is non-empty.
+    """
+    if "instruction" in rec:
+        instruction = rec["instruction"]
+    else:
+        instruction = rec.get("prompt", "")
+    if "output" in rec:
+        output = rec["output"]
+    else:
+        output = rec.get("response", "")
+    system = rec.get("system", "")
+    user_input = rec.get("input", "")
+    if user_input:
+        user = f"{instruction}\n\n{user_input}"
+    else:
+        user = instruction
+    return system, instruction, user_input, user, output
+def to_format(records: List[Dict[str, Any]], fmt: str) -> List[Dict[str, Any]]:
+    """Convert `records` to the schema named by `fmt`.
+    `raw` returns the input list itself, unchanged. Every other format builds
+    a new list with one converted record per input record.
+    Raises SynthkitError when `fmt` is not one of FORMATS.
+    """
+    if fmt not in FORMATS:
+        raise SynthkitError(f"unknown --format {fmt!r} (choose from {', '.join(FORMATS)})")
+    if fmt == "raw":
+        return records
+    converted: List[Dict[str, Any]] = []
+    for rec in records:
+        system, instruction, user_input, user, output = _parts(rec)
+        if fmt == "alpaca":
+            converted.append(
+                {"instruction": instruction, "input": user_input, "output": output})
+        elif fmt == "sharegpt":
+            # the system turn is emitted only when the record carries one
+            turns = [{"from": "system", "value": system}] if system else []
+            turns.append({"from": "human", "value": user})
+            turns.append({"from": "gpt", "value": output})
+            converted.append({"conversations": turns})
+        elif fmt == "openai":
+            messages = [{"role": "system", "content": system}] if system else []
+            messages.append({"role": "user", "content": user})
+            messages.append({"role": "assistant", "content": output})
+            converted.append({"messages": messages})
+    return converted

synthkit/grading.py ADDED Viewed

	@@ -0,0 +1,422 @@

+"""The grading engine, the part every synthkit product shares.
+Given a list of records, score the dataset on four axes:
+  validity       structurally sound records (required fields, non-empty, sane length)
+  uniqueness     free of exact and near-duplicate records (MinHash + LSH)
+  diversity      lexical variety across the set (distinct-n, self-similarity)
+  contamination  overlap with a held-out eval/benchmark set (n-gram containment)
+Everything here is standard-library only and deterministic for a fixed seed.
+"""
+from __future__ import annotations
+import hashlib
+import math
+import random
+import re
+from typing import Any, Dict, List, Optional, Sequence, Set, Tuple
+from synthkit.models import DimensionScore, GradeReport, to_grade
+_MERSENNE = (1 << 61) - 1
+_WORD = re.compile(r"[a-z0-9]+")
+# Content fields we analyze by default (role/config keys like "system" excluded).
+_KNOWN_FIELDS = ("instruction", "input", "prompt", "question",
+                 "output", "response", "answer", "text")
+# ---- text helpers ------------------------------------------------------------
+def record_text(rec: Dict[str, Any], fields: Optional[Sequence[str]]) -> str:
+    """Join the analyzed field values of `rec` into one string, two spaces apart.
+    With `fields` given, exactly those keys are read (missing keys contribute
+    an empty string). Otherwise the known content fields present in the record
+    are used, falling back to every string-valued key when none is present.
+    None values are skipped; other values are converted with `str`.
+    """
+    if fields:
+        names = list(fields)
+    else:
+        names = [name for name in _KNOWN_FIELDS if name in rec]
+        if not names:
+            names = [name for name, value in rec.items() if isinstance(value, str)]
+    pieces: List[str] = []
+    for name in names:
+        value = rec.get(name, "")
+        if value is not None:
+            pieces.append(str(value))
+    return "  ".join(pieces)
+def tokens(text: str) -> List[str]:
+    """Lower-case `text` and return its `_WORD` matches in order of appearance."""
+    lowered = text.lower()
+    return _WORD.findall(lowered)
+def _stable_hash(s: str) -> int:
+    """Map `s` to a 64-bit unsigned integer via an 8-byte BLAKE2b digest.
+    The digest of the UTF-8 bytes is read big-endian.
+    """
+    digest = hashlib.blake2b(s.encode("utf-8"), digest_size=8).digest()
+    return int.from_bytes(digest, "big")
+def shingles(toks: Sequence[str], k: int) -> Set[str]:
+    """Return the set of space-joined `k`-token windows of `toks`.
+    An empty input gives an empty set; an input shorter than `k` gives a
+    single shingle holding all of its tokens.
+    """
+    if not toks:
+        return set()
+    count = len(toks)
+    if count < k:
+        return {" ".join(toks)}
+    windows: Set[str] = set()
+    for start in range(count - k + 1):
+        windows.add(" ".join(toks[start:start + k]))
+    return windows
+def ngram_set(toks: Sequence[str], n: int) -> Set[Tuple[str, ...]]:
+    """Return the word n-grams of `toks` as a set of tuples.
+    An empty input gives an empty set; an input of at most `n` tokens
+    contributes one tuple holding all of its tokens.
+    """
+    if not toks:
+        return set()
+    count = len(toks)
+    if count <= n:
+        return {tuple(toks)}
+    grams: Set[Tuple[str, ...]] = set()
+    for start in range(count - n + 1):
+        grams.add(tuple(toks[start:start + n]))
+    return grams
+# ---- MinHash + LSH near-duplicate detection ----------------------------------
+class _MinHasher:
+    """Seeded MinHash signer built from `num_perm` affine maps modulo `_MERSENNE`."""
+    def __init__(self, num_perm: int, seed: int) -> None:
+        """Draw `num_perm` multipliers, then `num_perm` offsets, from a seeded RNG."""
+        source = random.Random(seed)
+        # all multipliers are drawn before any offset; the draw order fixes the values
+        self.a = [source.randrange(1, _MERSENNE) for _ in range(num_perm)]
+        self.b = [source.randrange(0, _MERSENNE) for _ in range(num_perm)]
+    def sign(self, shs: Set[str]) -> Optional[Tuple[int, ...]]:
+        """Return the signature of `shs`, or None when the set is empty.
+        Each position holds the minimum of one affine map over the shingle hashes.
+        """
+        if not shs:
+            return None
+        hashed = [_stable_hash(item) for item in shs]
+        signature: List[int] = []
+        for mult, offset in zip(self.a, self.b):
+            signature.append(min((mult * h + offset) % _MERSENNE for h in hashed))
+        return tuple(signature)
+class _UnionFind:
+    """Disjoint-set forest over the indices 0..n-1 whose roots are the smallest members."""
+    def __init__(self, n: int) -> None:
+        """Start with every index as its own root."""
+        self.parent = list(range(n))
+    def find(self, x: int) -> int:
+        """Return the root of `x`, shortening the path walked along the way."""
+        node = x
+        while self.parent[node] != node:
+            # point the node at its grandparent before stepping up
+            self.parent[node] = self.parent[self.parent[node]]
+            node = self.parent[node]
+        return node
+    def union(self, x: int, y: int) -> None:
+        """Merge the sets holding `x` and `y`; the smaller root index stays root."""
+        root_x, root_y = self.find(x), self.find(y)
+        if root_x != root_y:
+            # keep the smaller index as root (the "original" of the cluster)
+            keep, drop = (root_x, root_y) if root_x < root_y else (root_y, root_x)
+            self.parent[drop] = keep
+class _LSHIndex:
+    """MinHash + LSH index over a list of shingle sets.
+    Build once, then `candidates(shingles)` returns the (small) set of indices that
+    share at least one band with the query, turning all-pairs similarity work into
+    near-linear candidate lookups. Used for both near-dup detection and the
+    contamination check so neither is quadratic in the dataset size.
+    Candidates are a superset filter only: callers still verify each one with an
+    exact similarity computation.
+    """
+    def __init__(self, shingle_sets: List[Set[str]], *, num_perm: int = 64,
+                 bands: int = 16, seed: int = 17) -> None:
+        """Sign every non-empty shingle set and bucket it once per band.
+        Raises ValueError unless 1 <= bands <= num_perm.
+        """
+        # Each band needs at least one signature row. With zero rows every band key
+        # collapses to (band, ()), so every entry becomes a candidate of every other
+        # and the index silently degrades to the all-pairs scan it exists to avoid.
+        if bands < 1 or num_perm < bands:
+            raise ValueError(
+                f"need 1 <= bands <= num_perm, got bands={bands}, num_perm={num_perm}")
+        self._hasher = _MinHasher(num_perm, seed)
+        self._bands = bands
+        # when num_perm is not a multiple of bands the trailing signature rows are unused
+        self._rows = num_perm // bands
+        self._buckets: Dict[Tuple[int, Tuple[int, ...]], List[int]] = {}
+        for idx, shingset in enumerate(shingle_sets):
+            sig = self._hasher.sign(shingset)
+            if sig is None:
+                # empty shingle sets are never indexed
+                continue
+            for key in self._band_keys(sig):
+                self._buckets.setdefault(key, []).append(idx)
+    def _band_keys(self, sig: Tuple[int, ...]) -> List[Tuple[int, Tuple[int, ...]]]:
+        """Slice a signature into one (band number, row values) bucket key per band."""
+        rows = self._rows
+        return [(band, sig[band * rows:(band + 1) * rows])
+                for band in range(self._bands)]
+    def candidates(self, shingset: Set[str]) -> Set[int]:
+        """Return the indices sharing at least one band bucket with `shingset`.
+        An empty query set has no signature and yields no candidates.
+        """
+        sig = self._hasher.sign(shingset)
+        if sig is None:
+            return set()
+        out: Set[int] = set()
+        for key in self._band_keys(sig):
+            out.update(self._buckets.get(key, ()))
+        return out
+def _duplicate_map(shingle_sets: List[Set[str]], *, threshold: float = 0.8,
+                   seed: int = 17) -> Dict[int, int]:
+    """Return {pos: root_pos} for every entry that near-duplicates an earlier one.
+    LSH candidates are confirmed with exact Jaccard similarity against
+    `threshold`, and confirmed pairs are merged so that each cluster's root is
+    its smallest position. Empty shingle sets never take part.
+    """
+    lsh = _LSHIndex(shingle_sets, seed=seed)
+    groups = _UnionFind(len(shingle_sets))
+    for pos, current in enumerate(shingle_sets):
+        if not current:
+            continue
+        for cand in lsh.candidates(current):
+            # only look backwards so each pair is compared once
+            if cand >= pos:
+                continue
+            previous = shingle_sets[cand]
+            if not previous:
+                continue
+            jaccard = len(current & previous) / len(current | previous)
+            if jaccard >= threshold:
+                groups.union(pos, cand)
+    roots = ((pos, groups.find(pos)) for pos in range(len(shingle_sets)))
+    return {pos: root for pos, root in roots if root != pos}
+# ---- the four dimensions -----------------------------------------------------
+def _validity_dim(records, texts, *, min_words, max_words) -> DimensionScore:
+    """Score the share of records that are non-empty and within the word limits.
+    Each text lands in at most one bucket (empty, below `min_words`, above
+    `max_words`); the score is the percentage of records in none of them.
+    `records` is used only for its length. At most three example findings are
+    kept across all buckets combined.
+    """
+    n = len(records)
+    empty = short = long = 0
+    examples: List[str] = []
+    for text in texts:
+        wc = len(tokens(text))
+        # whitespace-only text counts as empty, checked before the word limits
+        if not text.strip():
+            empty += 1
+            if len(examples) < 3:
+                examples.append("empty record")
+        elif wc < min_words:
+            short += 1
+            if len(examples) < 3:
+                examples.append(f"only {wc} words: {text[:60]!r}")
+        elif wc > max_words:
+            long += 1
+            if len(examples) < 3:
+                examples.append(f"{wc} words (over {max_words})")
+    bad = empty + short + long
+    score = 100.0 * (1 - bad / n) if n else 0.0
+    # aggregate counts come first, then the individual examples collected above
+    findings: List[str] = []
+    if empty:
+        findings.append(f"{empty} empty record(s)")
+    if short:
+        findings.append(f"{short} below {min_words} words")
+    if long:
+        findings.append(f"{long} above {max_words} words")
+    findings += examples
+    return DimensionScore(
+        "validity", "Validity", round(score, 1),
+        f"{n - bad}/{n} records well-formed", findings,
+        {"empty": empty, "too_short": short, "too_long": long, "n": n}, weight=0.25)
+def _uniqueness_dim(records, texts, shingle_sets, *, seed) -> DimensionScore:
+    """Score the share of records that are neither exact nor near duplicates.
+    A record is an exact duplicate when its space-joined token string matches
+    an earlier record's. Near-duplicate search then runs only over the
+    remaining representatives. The score is the percentage of records in
+    neither group. `records` is used only for its length.
+    """
+    n = len(records)
+    seen: Dict[str, int] = {}
+    exact_dup: Set[int] = set()
+    rep_indices: List[int] = []
+    for idx, text in enumerate(texts):
+        norm = " ".join(tokens(text))
+        if norm and norm in seen:
+            exact_dup.add(idx)
+        else:
+            # records with no tokens are kept as representatives but never recorded
+            # in `seen`, so they are not counted as exact duplicates of each other
+            if norm:
+                seen[norm] = idx
+            rep_indices.append(idx)
+    # near-dup search runs only over exact-unique representatives (keeps it cheap)
+    rep_shingles = [shingle_sets[i] for i in rep_indices]
+    local_dup = _duplicate_map(rep_shingles, seed=seed)
+    near_dup: Set[int] = set()
+    clusters: Set[int] = set()
+    example = ""
+    for local_idx, local_root in local_dup.items():
+        # translate positions in the representative list back to dataset indices
+        gi, gr = rep_indices[local_idx], rep_indices[local_root]
+        near_dup.add(gi)
+        clusters.add(gr)
+        if not example:
+            example = f"e.g. #{gi} ≈ #{gr}: {texts[gi][:64]!r}"
+    dup_total = len(exact_dup) + len(near_dup)
+    score = 100.0 * (1 - dup_total / n) if n else 0.0
+    findings: List[str] = []
+    if exact_dup:
+        findings.append(f"{len(exact_dup)} exact duplicate(s)")
+    if near_dup:
+        findings.append(f"{len(near_dup)} near-duplicate(s) in {len(clusters)} cluster(s)")
+    if example:
+        findings.append(example)
+    return DimensionScore(
+        "uniqueness", "Uniqueness", round(score, 1),
+        f"{n - dup_total}/{n} unique  ({len(exact_dup)} exact, {len(near_dup)} near)",
+        findings,
+        {"exact": len(exact_dup), "near": len(near_dup),
+         "clusters": len(clusters), "n": n}, weight=0.30)
+def _diversity_dim(texts, shingle_sets, *, seed) -> DimensionScore:
+    """Score lexical variety from distinct-2 and sampled pairwise self-similarity.
+    distinct-1 and distinct-2 are the ratios of unique to total unigrams and
+    bigrams over the whole set. Self-similarity is the mean Jaccard similarity
+    of shingle sets over a seeded sample of record pairs. distinct-1 is
+    reported but does not enter the score.
+    """
+    unigrams: List[str] = []
+    bigrams: List[Tuple[str, str]] = []
+    for text in texts:
+        ts = tokens(text)
+        unigrams.extend(ts)
+        # bigrams are formed within a record, never across record boundaries
+        bigrams.extend(zip(ts, ts[1:]))
+    d1 = len(set(unigrams)) / len(unigrams) if unigrams else 0.0
+    d2 = len(set(bigrams)) / len(bigrams) if bigrams else 0.0
+    # self-similarity: mean Jaccard over a seeded sample of record pairs
+    rng = random.Random(seed)
+    idxs = [i for i, s in enumerate(shingle_sets) if s]
+    sims: List[float] = []
+    if len(idxs) >= 2:
+        # sample size is four pairs per non-empty record, capped at 2000
+        for _ in range(min(2000, len(idxs) * 4)):
+            i, j = rng.sample(idxs, 2)
+            a, b = shingle_sets[i], shingle_sets[j]
+            sims.append(len(a & b) / len(a | b))
+    self_sim = sum(sims) / len(sims) if sims else 0.0
+    # Pairwise distinctness (1 - self-similarity) is the size-stable signal and
+    # leads; distinct-2 is a secondary lexical-variety term with a lenient target
+    # (corpus-level distinct-n shrinks as the set grows). Raw numbers are reported
+    # in stats either way so the letter is never the whole story.
+    score = 100.0 * (0.6 * (1 - self_sim) + 0.4 * min(1, d2 / 0.25))
+    findings: List[str] = []
+    # NOTE(review): this warning fires below 0.4 while the score above gives the
+    # distinct-2 term full credit from 0.25, so a set can earn full credit and
+    # still be warned. Confirm the two thresholds are meant to differ.
+    if d2 < 0.4:
+        findings.append("low bigram diversity, templates may be too repetitive")
+    if self_sim > 0.3:
+        findings.append(f"records are {self_sim * 100:.0f}% similar on average")
+    return DimensionScore(
+        "diversity", "Diversity", round(score, 1),
+        f"distinct-2 {d2:.2f} · distinct-1 {d1:.2f} · self-sim {self_sim:.2f}",
+        findings,
+        {"distinct_1": round(d1, 4), "distinct_2": round(d2, 4),
+         "self_similarity": round(self_sim, 4), "vocab": len(set(unigrams))},
+        weight=0.25)
+def _contamination_dim(texts, shingle_sets, against_texts, *, ngram) -> DimensionScore:
+    """Score the share of records that do not overlap the held-out eval set.
+    A record is flagged when it near-duplicates an eval item (shingle Jaccard
+    of at least 0.7) or when at least 80% of its word n-grams occur in the
+    eval set. When `against_texts` is None the dimension is returned with a
+    score of None.
+    """
+    if against_texts is None:
+        return DimensionScore(
+            "contamination", "Contamination", None,
+            "no eval set provided (pass --against to check)", [], {}, weight=0.20)
+    eval_ngrams: Set[Tuple[str, ...]] = set()
+    eval_shingles: List[Set[str]] = []
+    for t in against_texts:
+        ts = tokens(t)
+        eval_ngrams |= ngram_set(ts, ngram)
+        # 5-token shingles; presumably `shingle_sets` uses the same size so the
+        # Jaccard comparison below is like-for-like (verify against the caller)
+        eval_shingles.append(shingles(ts, 5))
+    eval_index = _LSHIndex(eval_shingles, seed=17)     # avoid the O(records×eval) scan
+    flagged: List[Tuple[int, str, str]] = []
+    for idx, text in enumerate(texts):
+        ts = tokens(text)
+        sh = shingle_sets[idx]
+        hit, reason = False, ""
+        if sh:                                         # near-duplicate of an eval item
+            for j in eval_index.candidates(sh):
+                es = eval_shingles[j]
+                if es and len(sh & es) / len(sh | es) >= 0.7:
+                    hit, reason = True, "near-duplicate of an eval item"
+                    break
+        if not hit:
+            # n-gram containment: what fraction of THIS record's n-grams are in the
+            # eval set. Robust to shared template boilerplate (only a few n-grams),
+            # which a raw "shares any n-gram" check would over-flag.
+            grams = ngram_set(ts, ngram)
+            if grams:
+                contained = sum(1 for g in grams if g in eval_ngrams) / len(grams)
+                if contained >= 0.8:
+                    hit = True
+                    reason = f"{contained * 100:.0f}% of its {ngram}-grams are in the eval set"
+        if hit:
+            flagged.append((idx, reason, text[:64]))
+    n = len(texts)
+    score = 100.0 * (1 - len(flagged) / n) if n else 100.0
+    summary = (f"{len(flagged)}/{n} records overlap the eval set" if flagged
+               else f"clean: 0/{n} overlap the eval set")
+    # only the first four flagged records are listed as findings
+    findings = [f"#{i}: {why}: {snip!r}" for i, why, snip in flagged[:4]]
+    return DimensionScore(
+        "contamination", "Contamination", round(score, 1), summary, findings,
+        {"flagged": len(flagged), "n": n, "ngram": ngram}, weight=0.20)
+# ---- optional semantic axis (embedding-based) --------------------------------
+def _unit(v: Sequence[float]) -> List[float]:
+    """Scale `v` to unit Euclidean length; a zero-length vector is divided by 1.0."""
+    squares = [component * component for component in v]
+    length = math.sqrt(sum(squares))
+    if not length:
+        length = 1.0
+    return [component / length for component in v]
+def _cos_unit(a: Sequence[float], b: Sequence[float]) -> float:
+    """Return the dot product of `a` and `b`.
+    This equals the cosine similarity when both inputs have unit length.
+    Pairing stops at the shorter input.
+    """
+    products = [left * right for left, right in zip(a, b)]
+    return sum(products)
+def _semantic_dups(units: List[List[float]], threshold: float) -> Set[int]:
+    """Return the indices of vectors that duplicate an earlier, kept vector.
+    Greedy scan in index order: each vector not yet marked is a keeper, and
+    every later unmarked vector whose similarity to it is at least `threshold`
+    is marked as a duplicate. Similarity is the dot product, so inputs are
+    expected to be unit vectors.
+    Uses a NumPy similarity matrix when NumPy is importable and falls back to
+    pure-Python dot products otherwise.
+    """
+    n = len(units)
+    dup: Set[int] = set()
+    sims = None
+    # only the import is guarded, so an ImportError can come from nowhere else
+    try:
+        import numpy as np  # fast path if available
+    except ImportError:
+        pass
+    else:
+        matrix = np.asarray(units)  # convert the embeddings once
+        sims = matrix @ matrix.T
+    for a in range(n):
+        if a in dup:
+            continue
+        row = sims[a] if sims is not None else None
+        for b in range(a + 1, n):
+            if b in dup:
+                continue
+            score = row[b] if row is not None else _cos_unit(units[a], units[b])
+            if score >= threshold:
+                dup.add(b)
+    return dup
+def _semantic_dim(texts, embedder, *, threshold, seed, max_n=400) -> DimensionScore:
+    """Score the share of records that are not embedding-level near-duplicates.
+    Blank texts are skipped. When more than `max_n` texts remain, a seeded
+    sample of `max_n` is embedded instead and the summary says so. The score
+    is the percentage of embedded records not marked as duplicates at
+    `threshold`.
+    `embedder.embed` is called once with the list of selected texts.
+    """
+    idx = [i for i, t in enumerate(texts) if t.strip()]
+    note = ""
+    if len(idx) > max_n:
+        # sorted so the sampled records keep their original relative order
+        idx = sorted(random.Random(seed).sample(idx, max_n))
+        note = f" (sampled {max_n})"
+    units = [_unit(v) for v in embedder.embed([texts[i] for i in idx])]
+    dup = _semantic_dups(units, threshold)
+    # mean cosine over a seeded sample of pairs; reported only, not part of the score
+    rng = random.Random(seed)
+    sims = []
+    if len(units) >= 2:
+        for _ in range(min(2000, len(units) * 4)):
+            a, b = rng.sample(range(len(units)), 2)
+            sims.append(_cos_unit(units[a], units[b]))
+    mean_sim = sum(sims) / len(sims) if sims else 0.0
+    n = len(idx)
+    score = 100.0 * (1 - len(dup) / n) if n else 100.0
+    findings = []
+    if dup:
+        findings.append(f"{len(dup)} semantic near-duplicate(s) at cosine ≥ {threshold} "
+                        "(paraphrases lexical dedup misses)")
+    summary = f"{n - len(dup)}/{n} semantically distinct · mean cosine {mean_sim:.2f}{note}"
+    return DimensionScore(
+        "semantic", "Semantic dedup", round(score, 1), summary, findings,
+        {"semantic_dups": len(dup), "mean_cosine": round(mean_sim, 4),
+         "n": n, "threshold": threshold}, weight=0.20)
+# ---- public API --------------------------------------------------------------
def grade_dataset(records: List[Dict[str, Any]], *,
                  fields: Optional[Sequence[str]] = None,
                  against: Optional[List[Dict[str, Any]]] = None,
                  against_fields: Optional[Sequence[str]] = None,
                  ngram: int = 8, min_words: int = 3, max_words: int = 512,
                  seed: int = 17, embedder=None,
                  semantic_threshold: float = 0.83) -> GradeReport:
    """Grade a dataset and return the combined report.

    The validity, uniqueness, diversity and contamination dimensions are
    always computed; the semantic dimension is added only when an
    ``embedder`` is supplied. The overall score is the weight-averaged score
    of the dimensions that report themselves as applicable.

    Args:
        records: the dataset to grade.
        fields: record fields to read text from (passed to ``record_text``).
        against: optional reference records for the contamination check.
        against_fields: fields to read from ``against``; falls back to
            ``fields`` when not given.
        ngram: forwarded to the contamination dimension.
        min_words, max_words: forwarded to the validity dimension.
        seed: forwarded to the dimensions that sample.
        embedder: optional embedder enabling the semantic dimension.
        semantic_threshold: threshold forwarded to the semantic dimension.

    Returns:
        A ``GradeReport``; an empty dataset yields grade "F" with score 0.0.
    """
    if not records:
        return GradeReport(grade="F", score=0.0, n_records=0, dimensions=[],
                           meta={"note": "empty dataset"})

    texts = [record_text(rec, fields) for rec in records]
    shingle_sets = [shingles(tokens(text), 5) for text in texts]

    if against is None:
        against_texts = None
    else:
        reference_fields = against_fields or fields
        against_texts = [record_text(rec, reference_fields) for rec in against]

    dims = [
        _validity_dim(records, texts, min_words=min_words, max_words=max_words),
        _uniqueness_dim(records, texts, shingle_sets, seed=seed),
        _diversity_dim(texts, shingle_sets, seed=seed),
        _contamination_dim(texts, shingle_sets, against_texts, ngram=ngram),
    ]
    if embedder is not None:
        dims.append(_semantic_dim(texts, embedder, threshold=semantic_threshold, seed=seed))

    # Only applicable dimensions take part in the weighted average.
    scored = [dim for dim in dims if dim.applicable]
    total_weight = sum(dim.weight for dim in scored) or 1.0
    weighted_sum = sum(dim.score * dim.weight for dim in scored)
    overall = weighted_sum / total_weight

    meta = {
        "fields": list(fields) if fields else "auto",
        "has_eval": against_texts is not None,
        "semantic": embedder is not None,
    }
    return GradeReport(grade=to_grade(overall), score=round(overall, 1),
                       n_records=len(records), dimensions=dims, meta=meta)

synthkit/io_utils.py ADDED Viewed

	@@ -0,0 +1,68 @@

+"""Read and write datasets as JSONL, JSON, or CSV, standard library only."""
+from __future__ import annotations
+import csv
+import json
+import os
+from typing import Any, Dict, List
+from synthkit.models import SynthkitError
+def _ext(path: str) -> str:
+    return os.path.splitext(path)[1].lower()
def read_records(path: str) -> List[Dict[str, Any]]:
    """Load records from .jsonl / .ndjson / .json / .csv / .tsv.

    * JSONL / NDJSON: one JSON value per non-blank line.
    * JSON: a top-level array is returned as a list; a top-level object is
      unwrapped if it has a list under "data", "records", "rows" or
      "examples", and otherwise returned as a single record.
    * CSV / TSV: one dict per row, keyed by the header line.

    Raises:
        SynthkitError: the extension is not supported, or a .json file's
            top-level value is neither an array nor an object.
    """
    ext = _ext(path)
    # "utf-8-sig" reads plain UTF-8 and also strips a leading BOM, which would
    # otherwise end up glued to the first CSV column name. newline="" is what
    # the csv module asks for so quoted fields with embedded newlines survive.
    with open(path, "r", encoding="utf-8-sig", newline="") as fh:
        if ext in (".jsonl", ".ndjson"):
            return [json.loads(line) for line in fh if line.strip()]
        if ext == ".json":
            data = json.load(fh)
            if isinstance(data, dict):
                # common wrappers: {"data": [...]} / {"records": [...]}
                for key in ("data", "records", "rows", "examples"):
                    if isinstance(data.get(key), list):
                        return data[key]
                return [data]
            if isinstance(data, list):
                return list(data)
            # A bare string/number/bool/null is not a dataset; list() on it
            # would either raise TypeError or split a string into characters.
            raise SynthkitError(
                f"{path!r}: expected a JSON array or object at the top level, "
                f"got {type(data).__name__}")
        if ext in (".csv", ".tsv"):
            delim = "\t" if ext == ".tsv" else ","
            return list(csv.DictReader(fh, delimiter=delim))
    raise SynthkitError(f"unsupported input format {ext or path!r}")
def write_jsonl(path: str, records: List[Dict[str, Any]]) -> None:
    """Write ``records`` to ``path`` as UTF-8 JSON Lines, one record per line.

    The parent directory is created first if needed. Non-ASCII characters are
    written as-is rather than as ``\\u`` escapes.
    """
    _ensure_dir(path)
    with open(path, "w", encoding="utf-8") as out:
        out.writelines(json.dumps(rec, ensure_ascii=False) + "\n" for rec in records)
def write_text(path: str, text: str) -> None:
    """Write ``text`` to ``path`` as UTF-8, creating the parent directory if needed."""
    _ensure_dir(path)
    with open(path, mode="w", encoding="utf-8") as out:
        out.write(text)
def load_spec(path: str) -> Dict[str, Any]:
    """Load a seed spec from JSON (always) or YAML (if pyyaml is installed).

    Files ending in .yaml / .yml are parsed with ``yaml.safe_load``; every
    other extension is parsed as JSON.

    Raises:
        SynthkitError: a YAML file was given but pyyaml is not installed, or
            the parsed document is not a mapping (for example an empty YAML
            file, or a JSON array).
    """
    ext = _ext(path)
    with open(path, "r", encoding="utf-8") as fh:
        if ext in (".yaml", ".yml"):
            try:
                import yaml  # optional dependency
            except ImportError as exc:  # pragma: no cover
                raise SynthkitError(
                    "reading YAML seeds needs pyyaml. "
                    "`pip install pyyaml`, or use a .json seed."
                ) from exc
            spec = yaml.safe_load(fh)
        else:
            spec = json.load(fh)
    # Fail here with a clear message instead of letting a later ``spec.get``
    # blow up on None or a list.
    if not isinstance(spec, dict):
        raise SynthkitError(
            f"seed spec {path!r} must be a mapping at the top level, "
            f"got {type(spec).__name__}")
    return spec
+def _ensure_dir(path: str) -> None:
+    d = os.path.dirname(os.path.abspath(path))
+    os.makedirs(d, exist_ok=True)

synthkit/models.py ADDED Viewed

	@@ -0,0 +1,61 @@

+"""Core data types shared across every synthkit product."""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
class SynthkitError(Exception):
    """An error meant to be shown to the user.

    Typical causes: a bad seed spec, unreadable input, or a provider failure.
    Library code raises this rather than calling sys.exit / raising
    SystemExit, so code that embeds the library (the Gradio app, the tests)
    can catch it. The CLI turns it into a clean non-zero exit.
    """
# Letter-grade cutoffs as (minimum score, letter), ordered best to worst.
# Same scale as the rest of the portfolio.
GRADE_BANDS = [
    (97, "A+"),
    (93, "A"),
    (85, "B"),
    (75, "C"),
    (65, "D"),
    (0, "F"),
]
def to_grade(score: float) -> str:
    """Map a numeric score to the letter of the first band whose cutoff it meets.

    Falls back to "F" when no band in ``GRADE_BANDS`` matches.
    """
    matching = (letter for cutoff, letter in GRADE_BANDS if score >= cutoff)
    return next(matching, "F")
@dataclass
class DimensionScore:
    """The result for a single quality axis (e.g. validity, uniqueness,
    diversity, contamination).

    An axis that could not be evaluated carries ``score=None`` and reports
    ``applicable`` as False.
    """
    # Identifier of the axis.
    key: str
    # Display label of the axis.
    title: str
    # 0–100, or None when the axis is not applicable.
    score: Optional[float]
    # Short description of the outcome.
    summary: str = ""
    # Human-readable notes.
    findings: List[str] = field(default_factory=list)
    # Raw numbers behind the score.
    stats: Dict[str, Any] = field(default_factory=dict)
    # Weight of this axis.
    weight: float = 1.0

    @property
    def applicable(self) -> bool:
        """True when this axis has a score."""
        has_score = self.score is not None
        return has_score
@dataclass
class GradeReport:
    """The outcome of grading one dataset: overall grade plus per-axis results."""
    # Letter grade.
    grade: str
    # Overall score, 0–100.
    score: float
    # Number of records that were graded.
    n_records: int
    # One entry per quality axis.
    dimensions: List[DimensionScore] = field(default_factory=list)
    # Free-form extra information about the run.
    meta: Dict[str, Any] = field(default_factory=dict)

    def dim(self, key: str) -> Optional[DimensionScore]:
        """Return the first dimension whose ``key`` matches, or None if there is none."""
        return next((d for d in self.dimensions if d.key == key), None)

synthkit/privacy/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+"""Product C: privacy-safe synthetic twins of real datasets. [roadmap]
+Fits a generator to a real dataset, emits a statistically similar synthetic
+copy, and adds privacy/utility dimensions (membership-inference distance,
+column-distribution fidelity) to the shared grading report.
+"""

synthkit/providers.py ADDED Viewed

	@@ -0,0 +1,170 @@

+"""Response + embedding providers.
+A *provider* turns a prompt into a response (for instruction→output pairs); an
+*embedder* turns text into a vector (for the optional semantic-quality axis).
+The 'none' path is stdlib; ollama is local & free; anthropic/openai are lazy
+imports used only if selected.
+"""
+from __future__ import annotations
+import json
+import os
+import urllib.error
+import urllib.request
+from typing import List, Optional
+from synthkit.models import SynthkitError
+# ---- HTTP helper with friendly Ollama errors ---------------------------------
def _post_json(url: str, payload: dict, timeout: int = 120) -> dict:
    """POST ``payload`` as JSON to ``url`` and return the decoded JSON reply.

    Args:
        url: full endpoint URL.
        payload: JSON-serialisable request body.
        timeout: socket timeout in seconds, passed to ``urlopen``.

    Returns:
        The response body parsed as JSON.

    Raises:
        SynthkitError: the server answered with an HTTP error, could not be
            reached, the connection failed while the reply was being read,
            or the reply was not valid JSON. The original exception is kept
            as ``__cause__``.
    """
    body = json.dumps(payload).encode("utf-8")
    req = urllib.request.Request(url, data=body,
                                 headers={"Content-Type": "application/json"})
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            raw = resp.read()
    except urllib.error.HTTPError as exc:
        detail = exc.read().decode("utf-8", "ignore")[:200]
        hint = ""
        if exc.code == 404 and "model" in detail.lower():
            hint = ". Pull it first with `ollama pull <model>`"
        raise SynthkitError(
            f"Ollama returned HTTP {exc.code} from {url}{hint}\n  {detail}") from exc
    except urllib.error.URLError as exc:
        raise SynthkitError(
            f"can't reach Ollama at {url} ({exc.reason}). "
            "Is the daemon running? Start it with `ollama serve`.") from exc
    except OSError as exc:
        # Timeouts and resets that happen while the body is being read are
        # plain OSErrors, not URLErrors; report them the same friendly way.
        raise SynthkitError(f"connection to Ollama at {url} failed ({exc})") from exc
    try:
        return json.loads(raw.decode("utf-8"))
    except ValueError as exc:
        # Covers both undecodable bytes and malformed JSON.
        raise SynthkitError(f"Ollama at {url} returned a non-JSON response") from exc
+# ---- response providers ------------------------------------------------------
class Provider:
    """Base interface for response providers: turn a prompt into a response."""
    name = "base"

    def generate(self, prompt: str, system: str = "") -> str:
        """Return the response for ``prompt``; subclasses must override this."""
        raise NotImplementedError
class OllamaProvider(Provider):
    """Local, free responses via a running Ollama daemon."""
    name = "ollama"

    def __init__(self, model: str = "llama3.2", host: str = "") -> None:
        """Remember the model and resolve the daemon address.

        The address is ``host`` if given, else the OLLAMA_HOST environment
        variable, else http://localhost:11434; trailing slashes are removed.
        """
        base_url = host or os.environ.get("OLLAMA_HOST", "http://localhost:11434")
        self.model = model
        self.host = base_url.rstrip("/")

    def generate(self, prompt: str, system: str = "") -> str:
        """Send one non-streaming request to /api/generate and return the stripped text."""
        request = {
            "model": self.model, "prompt": prompt,
            "system": system, "stream": False,
        }
        reply = _post_json(self.host + "/api/generate", request)
        text = reply.get("response") or ""
        return text.strip()
class AnthropicProvider(Provider):
    """Responses from the Anthropic Messages API (SDK imported on first use)."""
    name = "anthropic"

    def __init__(self, model: str = "claude-haiku-4-5-20251001") -> None:
        """Create the SDK client; raises SynthkitError if the SDK is missing."""
        try:
            import anthropic
        except ImportError as exc:
            raise SynthkitError("--provider anthropic needs the anthropic SDK. "
                                "`pip install anthropic`.") from exc
        self.model = model
        self._client = anthropic.Anthropic()

    def generate(self, prompt: str, system: str = "") -> str:
        """Ask for up to 1024 tokens and return the joined text blocks, stripped."""
        reply = self._client.messages.create(
            model=self.model, max_tokens=1024,
            system=system or "You are a helpful assistant.",
            messages=[{"role": "user", "content": prompt}])
        # Keep only the text blocks of the reply.
        pieces = [block.text for block in reply.content
                  if getattr(block, "type", "") == "text"]
        return "".join(pieces).strip()
class OpenAIProvider(Provider):
    """Responses from the OpenAI chat-completions API (SDK imported on first use)."""
    name = "openai"

    def __init__(self, model: str = "gpt-4o-mini") -> None:
        """Create the SDK client; raises SynthkitError if the SDK is missing."""
        try:
            import openai
        except ImportError as exc:
            raise SynthkitError("--provider openai needs the openai SDK. "
                                "`pip install openai`.") from exc
        self.model = model
        self._client = openai.OpenAI()

    def generate(self, prompt: str, system: str = "") -> str:
        """Send a system + user message pair and return the first choice, stripped."""
        conversation = [
            {"role": "system", "content": system or "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ]
        resp = self._client.chat.completions.create(
            model=self.model, messages=conversation)
        content = resp.choices[0].message.content or ""
        return content.strip()
def get_provider(name: Optional[str], model: str = "") -> Optional[Provider]:
    """Build the response provider called ``name``.

    Returns None for a missing name or "none". An empty ``model`` selects the
    provider's default model. Raises SynthkitError for an unknown name.
    """
    if not name or name == "none":
        return None
    # (provider name, class, default model)
    registry = (
        ("ollama", OllamaProvider, "llama3.2"),
        ("anthropic", AnthropicProvider, "claude-haiku-4-5-20251001"),
        ("openai", OpenAIProvider, "gpt-4o-mini"),
    )
    for key, factory, default_model in registry:
        if name == key:
            return factory(model or default_model)
    raise SynthkitError(f"unknown provider {name!r}")
+# ---- embedders (for the optional semantic axis) ------------------------------
class Embedder:
    """Base interface for embedders: turn each text into a vector."""
    name = "base"

    def embed(self, texts: List[str]) -> List[List[float]]:
        """Return one vector per input text; subclasses must override this."""
        raise NotImplementedError
class OllamaEmbedder(Embedder):
    """Embeddings from a running Ollama daemon, one request per text."""
    name = "ollama"

    def __init__(self, model: str = "nomic-embed-text", host: str = "") -> None:
        """Remember the model and resolve the daemon address.

        The address is ``host`` if given, else the OLLAMA_HOST environment
        variable, else http://localhost:11434; trailing slashes are removed.
        """
        base_url = host or os.environ.get("OLLAMA_HOST", "http://localhost:11434")
        self.model = model
        self.host = base_url.rstrip("/")

    def embed(self, texts: List[str]) -> List[List[float]]:
        """Embed each text in order; raises SynthkitError if a reply has no vector."""
        vectors: List[List[float]] = []
        for text in texts:
            reply = _post_json(self.host + "/api/embeddings",
                               {"model": self.model, "prompt": text})
            vec = reply.get("embedding")
            if not vec:
                raise SynthkitError(f"Ollama embedder returned no vector for model {self.model!r}")
            vectors.append(vec)
        return vectors
class OpenAIEmbedder(Embedder):
    """Embeddings from the OpenAI embeddings API (SDK imported on first use)."""
    name = "openai"

    def __init__(self, model: str = "text-embedding-3-small") -> None:
        """Create the SDK client; raises SynthkitError if the SDK is missing."""
        try:
            import openai
        except ImportError as exc:
            raise SynthkitError("--embed-provider openai needs the openai SDK. "
                                "`pip install openai`.") from exc
        self.model = model
        self._client = openai.OpenAI()

    def embed(self, texts: List[str]) -> List[List[float]]:
        """Embed all ``texts`` in one request and return one vector per text.

        An empty ``texts`` returns ``[]`` without contacting the API.
        """
        if not texts:
            # Nothing to embed: skip the round-trip instead of sending a
            # request with an empty input list.
            return []
        resp = self._client.embeddings.create(model=self.model, input=texts)
        return [d.embedding for d in resp.data]
def get_embedder(name: Optional[str], model: str = "") -> Optional[Embedder]:
    """Build the embedder called ``name``.

    Returns None for a missing name or "none". An empty ``model`` selects the
    embedder's default model. Raises SynthkitError for an unknown name.
    """
    if not name or name == "none":
        return None
    # (embedder name, class, default model)
    registry = (
        ("ollama", OllamaEmbedder, "nomic-embed-text"),
        ("openai", OpenAIEmbedder, "text-embedding-3-small"),
    )
    for key, factory, default_model in registry:
        if name == key:
            return factory(model or default_model)
    raise SynthkitError(f"unknown embed provider {name!r}")

synthkit/report.py ADDED Viewed

	@@ -0,0 +1,164 @@

+"""Render a GradeReport: terminal, JSON, and a standalone HTML report."""
+from __future__ import annotations
+import html
+import json
+from typing import List
+from synthkit import __version__
+from synthkit.models import GradeReport, to_grade
+# ---- ANSI helpers ------------------------------------------------------------
+_C = {
+    "reset": "\033[0m", "bold": "\033[1m", "dim": "\033[2m",
+    "red": "\033[31m", "green": "\033[32m", "yellow": "\033[33m",
+    "blue": "\033[34m", "magenta": "\033[35m", "cyan": "\033[36m",
+}
+GRADE_COLOR = {"A+": "green", "A": "green", "B": "cyan", "C": "yellow", "D": "yellow", "F": "red"}
+GRADE_HEX = {"A+": "#16a34a", "A": "#16a34a", "B": "#0891b2", "C": "#ca8a04",
+             "D": "#ea580c", "F": "#dc2626"}
def _c(text: str, color: str) -> str:
    """Wrap ``text`` in the ANSI code for ``color`` followed by a reset.

    An unknown colour name adds no leading code; the reset is still appended.
    """
    start = _C.get(color, "")
    end = _C["reset"]
    return f"{start}{text}{end}"
+def _bar(score: float, width: int = 21) -> str:
+    fill = int(round(score / 100 * width))
+    return "█" * fill + "░" * (width - fill)
def _score_color(score: float) -> str:
    """Return the terminal colour name for ``score``'s letter grade ("yellow" if unmapped)."""
    letter = to_grade(score)
    return GRADE_COLOR.get(letter, "yellow")
def _hex(score: float) -> str:
    """Return the hex colour for ``score``'s letter grade ("#ca8a04" if unmapped)."""
    letter = to_grade(score)
    return GRADE_HEX.get(letter, "#ca8a04")
def print_report(report: GradeReport, dataset: str = "", use_color: bool = True) -> None:
    """Print the report to stdout: header box, one line per dimension, then notes.

    Args:
        report: the graded result to render.
        dataset: optional dataset label shown in the header.
        use_color: when False, no ANSI colour codes are emitted.
    """
    def paint(text, color):
        if not use_color:
            return text
        return _c(text, color)

    grade = report.grade

    # Header box.
    print()
    print(paint("  ┌─ synthkit · data quality ──────────────────────────", "dim"))
    print("  │")
    print(f"  │  Quality grade   {paint(grade, GRADE_COLOR.get(grade, 'yellow'))}    ({report.score}/100)")
    print(f"  │  Records         {report.n_records}")
    if dataset:
        print(f"  │  Dataset         {dataset}")
    print("  │")
    print(paint("  └────────────────────────────────────────────────────", "dim"))

    # One line per dimension; non-applicable ones are shown as n/a.
    print(f"\n  {paint('DIMENSIONS', 'bold')}\n")
    for d in report.dimensions:
        if not d.applicable:
            print(f"  {paint('○', 'dim')} {d.title:<14}{paint('  n/a', 'dim')}  {paint(d.summary, 'dim')}")
            continue
        shade = _score_color(d.score)
        mark = paint("●", shade)
        bar = paint(_bar(d.score), shade)
        print(f"  {mark} {d.title:<14}{d.score:>5.0f}  {bar}  {paint(d.summary, 'dim')}")

    # Findings from every dimension, in dimension order.
    notes = []
    for d in report.dimensions:
        notes.extend((d.title, finding) for finding in d.findings)
    if notes:
        print(f"\n  {paint('NOTES', 'bold')}\n")
        for title, finding in notes:
            print(f"  {paint('▸ ' + title + ':', 'cyan')} {finding}")

    print(f"\n  {paint('Grade = weighted blend of the applicable axes · tune with --against / --field', 'dim')}")
    print()
def to_json(report: GradeReport, dataset: str = "") -> str:
    """Serialise the report as a JSON string indented by two spaces.

    The document holds the dataset label, grade, score, record count, one
    object per dimension (key, title, score, summary, findings, stats) and
    the report's meta mapping.
    """
    dimensions = []
    for d in report.dimensions:
        dimensions.append({
            "key": d.key,
            "title": d.title,
            "score": d.score,
            "summary": d.summary,
            "findings": d.findings,
            "stats": d.stats,
        })
    document = {
        "dataset": dataset,
        "grade": report.grade,
        "score": report.score,
        "records": report.n_records,
        "dimensions": dimensions,
        "meta": report.meta,
    }
    return json.dumps(document, indent=2)
def to_html(report: GradeReport, dataset: str = "") -> str:
    """Render the report as a standalone HTML page with inline CSS.

    Dimension titles, summaries, findings and the dataset label are
    HTML-escaped. Non-applicable dimensions are shown as "n/a" without a
    score bar. When no dimension has findings, the notes section shows
    "Nothing flagged.".
    """
    gcolor = GRADE_HEX.get(report.grade, "#ca8a04")

    def card(d) -> str:
        # One card per dimension; the n/a variant has no score or bar.
        if not d.applicable:
            return f"""
        <div class="dim">
          <div class="dim-head">
            <span class="dt">{html.escape(d.title)}</span>
            <span class="ds na">n/a</span>
          </div>
          <div class="dsum">{html.escape(d.summary)}</div>
        </div>"""
        bc = _hex(d.score)
        return f"""
        <div class="dim">
          <div class="dim-head">
            <span class="dt">{html.escape(d.title)}</span>
            <span class="ds" style="color:{bc}">{d.score:.0f}</span>
          </div>
          <div class="track"><div class="fill" style="width:{d.score:.0f}%;background:{bc}"></div></div>
          <div class="dsum">{html.escape(d.summary)}</div>
        </div>"""

    rows: List[str] = [card(d) for d in report.dimensions]
    notes = []
    for d in report.dimensions:
        for finding in d.findings:
            notes.append(f'<li><b>{html.escape(d.title)}:</b> {html.escape(finding)}</li>')

    # Pre-render the variable fragments so the page template stays readable.
    dataset_suffix = (' · ' + html.escape(dataset)) if dataset else ''
    rows_html = ''.join(rows)
    if notes:
        notes_html = '<ul class="notes">' + ''.join(notes) + '</ul>'
    else:
        notes_html = '<p style="color:#86efac">Nothing flagged.</p>'

    return f"""<!doctype html><html lang="en"><head><meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>synthkit report</title>
<style>
  :root {{ color-scheme: light dark; }}
  body {{ font: 15px/1.55 -apple-system, Segoe UI, Roboto, sans-serif; margin: 0;
          background: #0b1020; color: #e5e7eb; }}
  .wrap {{ max-width: 760px; margin: 0 auto; padding: 40px 24px 80px; }}
  h1 {{ font-size: 20px; letter-spacing: .3px; margin: 0 0 4px; }}
  .sub {{ color: #94a3b8; margin: 0 0 28px; font-size: 13px; }}
  .hero {{ display: flex; gap: 24px; align-items: center; background: #111934;
           border: 1px solid #1e293b; border-radius: 14px; padding: 24px; margin-bottom: 28px; }}
  .grade {{ font-size: 56px; font-weight: 800; line-height: 1; color: {gcolor}; }}
  .meta {{ flex: 1; }}
  .meta .big {{ font-size: 15px; margin-bottom: 6px; }}
  .meta .small {{ color: #94a3b8; font-size: 13px; }}
  h2 {{ font-size: 13px; text-transform: uppercase; letter-spacing: 1px; color: #94a3b8;
        margin: 32px 0 14px; }}
  .dim {{ background: #111934; border: 1px solid #1e293b; border-radius: 12px;
          padding: 14px 18px; margin-bottom: 10px; }}
  .dim-head {{ display: flex; justify-content: space-between; align-items: baseline; }}
  .dt {{ font-weight: 700; }}
  .ds {{ font-size: 22px; font-weight: 800; }}
  .ds.na {{ color: #64748b; font-size: 15px; font-weight: 600; }}
  .track {{ height: 8px; background: #0b1020; border-radius: 999px; margin: 10px 0 8px; overflow: hidden; }}
  .fill {{ height: 100%; border-radius: 999px; }}
  .dsum {{ color: #cbd5e1; font-size: 13px; }}
  ul.notes {{ list-style: none; padding: 0; margin: 0; }}
  ul.notes li {{ background: #0e1830; border: 1px solid #1e293b; border-left: 3px solid #38bdf8;
                 border-radius: 8px; padding: 9px 14px; margin-bottom: 8px; font-size: 13.5px; }}
  ul.notes b {{ color: #818cf8; }}
  .foot {{ margin-top: 36px; color: #64748b; font-size: 12px; }}
</style></head><body><div class="wrap">
  <h1>synthkit: synthetic data quality report</h1>
  <p class="sub">validity · uniqueness · diversity · contamination</p>
  <div class="hero">
    <div class="grade">{report.grade}</div>
    <div class="meta">
      <div class="big">Quality score <b>{report.score}/100</b></div>
      <div class="small">{report.n_records} records{dataset_suffix}</div>
    </div>
  </div>
  <h2>Dimensions</h2>
  {rows_html}
  <h2>Notes</h2>
  {notes_html}
  <p class="foot">Generated by synthkit v{__version__} · grade is a weighted blend of the applicable axes.</p>
</div></body></html>"""

synthkit/tabular/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+"""Product B: schema-aware tabular fixtures with referential integrity. [roadmap]
+Slots into the same core as Product A: it will emit records that the shared
+grading engine (validity / uniqueness / diversity, plus tabular-specific
+referential-integrity and PII-safety checks) scores with the same report.
+"""

synthkit/text/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """Product A: synthetic instruction & evaluation datasets for LLMs."""

synthkit/text/generate.py ADDED Viewed

	@@ -0,0 +1,175 @@

+"""Generate text records (eval prompts or instruction→output pairs) from a seed spec.
+Two phases:
+  1. sample prompts: deterministic for a fixed seed; exact-duplicate prompts are
+     skipped by default so naive slot collisions don't pad the dataset.
+  2. fill responses: only for instruction data with response.mode == 'provider';
+     runs concurrently with a progress callback.
+Optional: pass a `dedup_embedder` to dedup *by meaning as you generate*, each
+candidate is embedded and rejected if it's within `dedup_threshold` cosine of an
+already-accepted record.
+Templates are rendered with a safe `{slot}`-only substitution (NOT str.format):
+attribute/index access and format specs are treated as literal text, so an
+untrusted template can't reach object internals or trigger a format-spec blow-up.
+"""
+from __future__ import annotations
+import random
+import re
+from typing import Any, Callable, Dict, List, Optional, Tuple
+from synthkit.grading import record_text
+from synthkit.models import SynthkitError
+from synthkit.providers import Embedder, Provider
+from synthkit.util import max_cosine, pmap, unit
+_PLACEHOLDER = re.compile(r"\{(\w+)\}")
+_MAX_RECORD_CHARS = 100_000  # guards against a pathological slot value
def render(template: str, fill: Dict[str, str]) -> str:
    """Substitute only bare ``{slot}`` placeholders; everything else stays literal.

    Raises:
        SynthkitError: a placeholder names a slot missing from ``fill``, or
            the rendered text is longer than ``_MAX_RECORD_CHARS``.
    """
    def substitute(match: "re.Match[str]") -> str:
        name = match.group(1)
        if name in fill:
            return str(fill[name])
        raise SynthkitError(f"template slot '{name}' is missing from 'slots'")

    rendered = _PLACEHOLDER.sub(substitute, template)
    if len(rendered) <= _MAX_RECORD_CHARS:
        return rendered
    raise SynthkitError(f"rendered record exceeds {_MAX_RECORD_CHARS} characters")
def _rule_response(resp_cfg: Dict[str, Any], fill: Dict[str, str]) -> str:
    """Render the response config's "template" (empty if absent) with the slot fill."""
    template = resp_cfg.get("template", "")
    return render(str(template), fill)
+def _shape_record(kind: str, prompt: str, response: str,
+                  system: str, domain: str) -> Dict[str, Any]:
+    if kind == "instruction":
+        rec: Dict[str, Any] = {"instruction": prompt, "input": "", "output": response}
+        if system:
+            rec["system"] = system
+    else:
+        rec = {"prompt": prompt}
+    if domain:
+        rec["domain"] = domain
+    return rec
+def _response_for(kind: str, mode: str, resp_cfg: Dict[str, Any], fill: Dict[str, str],
+                  prompt: str, system: str, provider: Optional[Provider]) -> str:
+    if kind != "instruction":
+        return ""
+    if mode == "none":
+        return ""
+    if mode == "rule":
+        return _rule_response(resp_cfg, fill)
+    if mode == "provider":
+        return provider.generate(prompt, system)  # type: ignore[union-attr]
+    raise SynthkitError(f"unknown response.mode {mode!r}")
def sample_prompts(spec: Dict[str, Any], n: int, *, seed: int = 17,
                   dedup: bool = True,
                   max_attempts: Optional[int] = None
                   ) -> List[Tuple[str, Dict[str, str]]]:
    """Phase 1: return up to n (prompt, slot-fill) pairs.

    Each attempt picks a random template and a random value for every slot,
    then renders the prompt. Prompts shorter than the spec's
    ``constraints.min_words`` are dropped; with ``dedup`` on, prompts that
    repeat an earlier one (ignoring case and whitespace runs) are dropped
    too. Sampling is deterministic for a fixed ``seed``.

    Args:
        spec: seed spec with "templates" and optional "slots" /
            "constraints".
        n: number of prompts wanted.
        seed: seed for the random generator.
        dedup: skip exact-duplicate prompts.
        max_attempts: attempt budget; defaults to ``max(n * 50, 200)``. Fewer
            than ``n`` prompts are returned when the budget runs out.

    Raises:
        SynthkitError: the spec has no templates, a slot is not a non-empty
            list of values, or a template references an unknown slot.
    """
    templates = spec.get("templates") or []
    if not templates:
        raise SynthkitError("seed spec has no 'templates'")
    slots: Dict[str, List[str]] = spec.get("slots") or {}
    # Validate up front: an empty list would make rng.choice raise a bare
    # IndexError, and a plain string would be sampled one character at a time.
    for name, values in slots.items():
        if not isinstance(values, (list, tuple)) or not values:
            raise SynthkitError(f"slot '{name}' must be a non-empty list of values")
    min_words = int((spec.get("constraints") or {}).get("min_words", 0))
    rng = random.Random(seed)
    out: List[Tuple[str, Dict[str, str]]] = []
    seen: set = set()
    attempts = 0
    cap = max_attempts if max_attempts is not None else max(n * 50, 200)
    while len(out) < n and attempts < cap:
        attempts += 1
        template = rng.choice(templates)
        # Every slot is drawn on every attempt, even if this template does
        # not use it.
        fill = {k: rng.choice(v) for k, v in slots.items()}
        prompt = render(template, fill)
        if len(prompt.split()) < min_words:
            continue
        if dedup:
            key = " ".join(prompt.lower().split())
            if key in seen:
                continue
            seen.add(key)
        out.append((prompt, fill))
    return out
def _generate_dedup(spec, n, provider, seed, dedup, progress,
                    embedder: Embedder, threshold: float,
                    stats: Optional[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Generate up to ``n`` records, rejecting semantic near-duplicates as it goes.

    A pool of ``max(n * 5, n + 100)`` candidate prompts is sampled. Each
    candidate is turned into a full record and embedded, and is kept only if
    its highest cosine similarity to the already-accepted records is below
    ``threshold``. ``progress(accepted, n)`` is called after every candidate.
    When ``stats`` is given it receives "rejected_semantic", "attempts" and
    "pool".
    """
    kind = spec.get("kind", "eval")
    system = spec.get("system", "")
    domain = spec.get("domain", "")
    resp_cfg = spec.get("response") or {"mode": "none"}
    mode = resp_cfg.get("mode", "none")

    pool = sample_prompts(spec, max(n * 5, n + 100), seed=seed, dedup=dedup)

    accepted: List[Dict[str, Any]] = []
    accepted_units: List[List[float]] = []
    rejected = 0
    attempts = 0
    for prompt, fill in pool:
        if len(accepted) >= n:
            break
        attempts += 1
        response = _response_for(kind, mode, resp_cfg, fill, prompt, system, provider)
        candidate = _shape_record(kind, prompt, response, system, domain)
        vec = unit(embedder.embed([record_text(candidate, None)])[0])
        # The first candidate has nothing to be compared against.
        too_close = bool(accepted_units) and max_cosine(vec, accepted_units) >= threshold
        if too_close:
            rejected += 1
        else:
            accepted.append(candidate)
            accepted_units.append(vec)
        if progress:
            progress(len(accepted), n)

    if stats is not None:
        stats["rejected_semantic"] = rejected
        stats["attempts"] = attempts
        stats["pool"] = len(pool)
    return accepted
def generate(spec: Dict[str, Any], n: int, *, provider: Optional[Provider] = None,
             seed: int = 17, dedup: bool = True,
             max_attempts: Optional[int] = None, concurrency: int = 1,
             progress: Optional[Callable[[int, int], None]] = None,
             dedup_embedder: Optional[Embedder] = None,
             dedup_threshold: float = 0.9,
             stats: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
    """Generate up to ``n`` records from a seed spec.

    With a ``dedup_embedder`` the work is handed to the semantic-dedup path.
    Otherwise prompts are sampled first and responses are filled afterwards:
    by template for mode "rule", or through ``provider`` (optionally across
    ``concurrency`` threads, reporting to ``progress``) for mode "provider".

    Raises:
        SynthkitError: mode is "provider" but no provider was passed, or an
            instruction spec uses an unknown response mode.
    """
    kind = spec.get("kind", "eval")
    system = spec.get("system", "")
    domain = spec.get("domain", "")
    resp_cfg = spec.get("response") or {"mode": "none"}
    mode = resp_cfg.get("mode", "none")

    if mode == "provider" and provider is None:
        raise SynthkitError("response.mode is 'provider' but no provider was selected "
                            "(pass --provider ollama|anthropic|openai)")

    if dedup_embedder is not None:
        return _generate_dedup(spec, n, provider, seed, dedup, progress,
                               dedup_embedder, dedup_threshold, stats)

    prompts = sample_prompts(spec, n, seed=seed, dedup=dedup, max_attempts=max_attempts)

    wants_response = kind == "instruction" and mode != "none"
    responses: List[str]
    if not wants_response:
        responses = [""] * len(prompts)
    elif mode == "rule":
        responses = [_rule_response(resp_cfg, fill) for _prompt, fill in prompts]
    elif mode == "provider":
        def ask(pair):
            return provider.generate(pair[0], system)
        responses = pmap(ask, prompts, concurrency=concurrency, progress=progress)
    else:
        raise SynthkitError(f"unknown response.mode {mode!r}")

    records = []
    for (prompt, _fill), response in zip(prompts, responses):
        records.append(_shape_record(kind, prompt, response, system, domain))
    return records

synthkit/text/seeds.py ADDED Viewed

	@@ -0,0 +1,64 @@

+"""Built-in seed specs used by `synthkit text gen --demo`.
+A seed spec is plain data (works as JSON or YAML on disk too):
+  kind        "eval" (prompt-only) or "instruction" (instruction→output)
+  templates   sentence templates with {slot} placeholders
+  slots       lists of fillers, one per placeholder name
+  response    how to fill the output for instruction data:
+                {"mode": "none"}                  leave blank (eval sets)
+                {"mode": "rule", "template": ...} fill a string template (offline)
+                {"mode": "provider"}              call an LLM provider
+  constraints optional, e.g. {"min_words": 4}
+"""
+from __future__ import annotations
+_LANGS = ["Python", "JavaScript", "Rust", "Go", "TypeScript", "Java", "C++", "Ruby"]
+_TASKS = [
+    "reverse a string",
+    "check whether a number is prime",
+    "merge two sorted lists",
+    "find the longest common subsequence of two strings",
+    "parse an ISO-8601 date",
+    "debounce a function",
+    "flatten a deeply nested list",
+    "compute a moving average over a stream",
+    "detect a cycle in a linked list",
+    "implement binary search",
+]
+DEMO_EVAL = {
+    "kind": "eval",
+    "domain": "coding",
+    "templates": [
+        "Write a {language} function that {task}.",
+        "How would you {task} in {language}? Walk through your reasoning.",
+        "Review this {language} snippet that is meant to {task} and point out the bugs.",
+        "Explain to a beginner how to {task} using {language}.",
+        "What's the most efficient way to {task} in {language}, and why?",
+        "Refactor a {language} program that {task} to be more readable.",
+    ],
+    "slots": {"language": _LANGS, "task": _TASKS},
+    "constraints": {"min_words": 4},
+    "response": {"mode": "none"},
+}
+DEMO_INSTRUCTION = {
+    "kind": "instruction",
+    "domain": "coding",
+    "system": "You are a precise, helpful coding assistant.",
+    "templates": [
+        "Write a {language} function that {task}.",
+        "Show me how to {task} in {language}.",
+        "I need {language} code to {task}. Include a short explanation.",
+    ],
+    "slots": {"language": _LANGS, "task": _TASKS},
+    "constraints": {"min_words": 4},
+    "response": {
+        "mode": "rule",
+        "template": "Here's an approach in {language} to {task}: start by clarifying "
+                    "the inputs and edge cases, then implement the core logic and test it.",
+    },
+}
+BUILTIN_SEEDS = {"eval": DEMO_EVAL, "instruction": DEMO_INSTRUCTION}

synthkit/util.py ADDED Viewed

	@@ -0,0 +1,55 @@

+"""Small shared utilities."""
+from __future__ import annotations
+import math
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Callable, Iterable, List, Optional, Sequence, TypeVar
# Type variables for ``pmap``'s signature: T is the item type, R the result type.
T = TypeVar("T")
R = TypeVar("R")
def pmap(fn: Callable[[T], R], items: Iterable[T], concurrency: int = 1,
         progress: Optional[Callable[[int, int], None]] = None) -> List[R]:
    """Map fn over items, optionally across threads, preserving input order.

    Exceptions propagate (the first one raised wins). `progress(done, total)`
    is called after each item completes.

    Args:
        fn: function applied to every item.
        items: inputs; consumed into a list up front.
        concurrency: number of worker threads; values <= 1 run sequentially
            in the calling thread.
        progress: optional callback, called from the calling thread.

    Returns:
        The results, in the same order as ``items``.
    """
    items = list(items)
    total = len(items)
    results: List[Optional[R]] = [None] * total
    if concurrency <= 1:
        for i, it in enumerate(items):
            results[i] = fn(it)
            if progress:
                progress(i + 1, total)
        return results  # type: ignore[return-value]
    done = 0
    with ThreadPoolExecutor(max_workers=concurrency) as ex:
        futs = {ex.submit(fn, it): i for i, it in enumerate(items)}
        try:
            for fut in as_completed(futs):
                results[futs[fut]] = fut.result()
                done += 1
                if progress:
                    progress(done, total)
        except BaseException:
            # Leaving the ``with`` block waits for the pool to drain. Cancel
            # whatever has not started yet so a failure does not keep running
            # the rest of the queue before the exception surfaces.
            for pending in futs:
                pending.cancel()
            raise
    return results  # type: ignore[return-value]
def unit(vec: Sequence[float]) -> List[float]:
    """Return ``vec`` scaled to L2 length 1; a zero vector comes back as zeros."""
    length = math.sqrt(sum(component * component for component in vec))
    if not length:
        # Avoid dividing by zero: a zero vector is divided by 1.0 instead.
        length = 1.0
    return [component / length for component in vec]
def max_cosine(u: Sequence[float], units: Sequence[Sequence[float]]) -> float:
    """Return the largest dot product between ``u`` and any vector in ``units``.

    For unit-length vectors the dot product is the cosine similarity. The
    result is never below 0.0: that is what an empty ``units``, or one with
    only negative similarities, returns.
    """
    best = 0.0
    for other in units:
        dot = sum((a * b for a, b in zip(u, other)), 0.0)
        if dot > best:
            best = dot
    return best