Sync package to GitHub source: em-dashes out of rendered output; no API/logic change
Browse files
- synthkit/__init__.py +13 -0
- synthkit/__main__.py +5 -0
- synthkit/cli.py +260 -0
- synthkit/formats.py +54 -0
- synthkit/grading.py +422 -0
- synthkit/io_utils.py +68 -0
- synthkit/models.py +61 -0
- synthkit/privacy/__init__.py +6 -0
- synthkit/providers.py +170 -0
- synthkit/report.py +164 -0
- synthkit/tabular/__init__.py +6 -0
- synthkit/text/__init__.py +1 -0
- synthkit/text/generate.py +175 -0
- synthkit/text/seeds.py +64 -0
- synthkit/util.py +55 -0
synthkit/__init__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""synthkit: generate synthetic data and grade it for quality.
|
| 2 |
+
|
| 3 |
+
Three products on one core:
|
| 4 |
+
• text (A): instruction / eval datasets for training & evaluating LLMs [live]
|
| 5 |
+
• tabular (B): schema-aware fixtures with referential integrity [roadmap]
|
| 6 |
+
• privacy (C): privacy-safe synthetic twins of real datasets [roadmap]
|
| 7 |
+
|
| 8 |
+
The generator layer differs per product; the grading engine, providers, and
|
| 9 |
+
report writers are shared.
|
| 10 |
+
"""
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
# Package version string; the CLI's `--version` flag prints it.
__version__ = "0.4.0"
|
synthkit/__main__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Package entry point: lets the CLI be started with `python3 -m synthkit …`."""
from synthkit.cli import main

# Only dispatch to the CLI when executed as a script/module, not on import.
if __name__ == "__main__":
    main()
|
synthkit/cli.py
ADDED
|
@@ -0,0 +1,260 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Command-line interface for synthkit."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import argparse
|
| 5 |
+
import sys
|
| 6 |
+
from typing import List
|
| 7 |
+
|
| 8 |
+
from synthkit import __version__
|
| 9 |
+
from synthkit.formats import FORMATS, to_format
|
| 10 |
+
from synthkit.grading import grade_dataset
|
| 11 |
+
from synthkit.io_utils import load_spec, read_records, write_jsonl, write_text
|
| 12 |
+
from synthkit.models import SynthkitError
|
| 13 |
+
from synthkit.providers import get_embedder, get_provider
|
| 14 |
+
from synthkit.report import print_report, to_html, to_json
|
| 15 |
+
from synthkit.text.generate import generate
|
| 16 |
+
from synthkit.text.seeds import BUILTIN_SEEDS, DEMO_EVAL
|
| 17 |
+
|
| 18 |
+
# Letter grades from worst to best. `cmd_grade` compares positions in this list
# for the --min-grade gate, and the list doubles as that flag's allowed choices.
_GRADE_ORDER = ["F", "D", "C", "B", "A", "A+"]
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _progress(done: int, total: int) -> None:
    """Redraw the single-line generation counter on stderr.

    The line starts with a carriage return so each call overwrites the
    previous one; a newline is emitted only once ``done`` reaches ``total``.
    """
    finished = done >= total
    line_end = "\n" if finished else ""
    print(f"\r generated {done}/{total}", end=line_end, file=sys.stderr, flush=True)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _maybe_embedder(args):
    """Return an embedder when ``args.semantic`` is set, otherwise ``None``.

    Attributes are read with ``getattr`` defaults, so a namespace that lacks
    the embedding flags is accepted: provider falls back to ``"ollama"`` and
    model to the empty string.
    """
    if getattr(args, "semantic", False):
        provider_name = getattr(args, "embed_provider", "ollama")
        model_name = getattr(args, "embed_model", "")
        return get_embedder(provider_name, model_name)
    return None
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
# ---- text gen ----------------------------------------------------------------
|
| 34 |
+
|
| 35 |
+
def _run_demo(args) -> int:
    """Generate the built-in demo dataset, grade it, and write all artifacts.

    Writes ``synthkit_demo.jsonl``, ``synthkit_demo.benchmark.jsonl`` and the
    JSON/HTML reports into the current directory. Reads ``args.ngram`` and
    ``args.no_color``. Always returns 0.
    """
    dataset_path = "synthkit_demo.jsonl"

    def _spec_with_tasks(task_subset):
        # Same demo spec, but restricted to the given slice of the task slot.
        return {**DEMO_EVAL, "slots": {**DEMO_EVAL["slots"], "task": task_subset}}

    print("synthkit demo: generating a coding eval set, then grading it…",
          file=sys.stderr)

    # Train and eval come from DISJOINT task lists (a real held-out split).
    # A known handful of eval records is then copied verbatim into train, so
    # the contamination axis measures genuine leakage rather than a single
    # generator overlapping with itself.
    all_tasks = DEMO_EVAL["slots"]["task"]
    train_records = generate(_spec_with_tasks(all_tasks[:7]), 195, seed=17)
    bench = generate(_spec_with_tasks(all_tasks[7:]), 40, seed=99)
    leaked = [dict(record) for record in bench[:5]]  # 5 genuine, verbatim leaks
    data = train_records + leaked

    write_jsonl(dataset_path, data)
    write_jsonl("synthkit_demo.benchmark.jsonl", bench)

    report = grade_dataset(data, against=bench, ngram=args.ngram)
    print_report(report, dataset=dataset_path, use_color=not args.no_color)
    write_text("synthkit_demo.report.json", to_json(report, dataset_path))
    write_text("synthkit_demo.report.html", to_html(report, dataset_path))

    print(f" wrote synthkit_demo.jsonl ({len(data)} records) + benchmark ({len(bench)}) "
          "+ .report.json + .report.html", file=sys.stderr)
    print(" note: eval uses HELD-OUT tasks; 5 records were deliberately leaked into "
          "train, so contamination flags exactly those real leaks.", file=sys.stderr)
    return 0
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def cmd_text_gen(args) -> int:
    """Handle ``synthkit text gen``: generate records, write them, then grade.

    With ``--demo`` the work is delegated to ``_run_demo``. Otherwise a seed
    spec is required (the process exits with an error message if it is
    missing). Records are written as JSONL in the requested format; grading
    runs on the raw generated records unless ``--no-grade`` is given.
    Returns 0.
    """
    if args.demo:
        return _run_demo(args)
    if not args.seed:
        sys.exit("error: provide --seed FILE (a JSON/YAML seed spec) or use --demo")

    spec = load_spec(args.seed)
    if args.kind:
        spec["kind"] = args.kind

    provider = get_provider(args.provider, args.model)
    if args.dedup_semantic:
        dedup_embedder = get_embedder(args.embed_provider, args.embed_model)
    else:
        dedup_embedder = None

    # The progress line is shown only when generation does slow work
    # (a real provider or semantic dedup) and the user has not opted out.
    slow_path = args.provider != "none" or args.dedup_semantic
    if slow_path and not args.no_progress:
        on_progress = _progress
    else:
        on_progress = None
    gen_stats: dict = {}

    print(f"synthkit: generating {args.num} records from {args.seed}…", file=sys.stderr)
    data = generate(
        spec,
        args.num,
        provider=provider,
        seed=args.seed_int,
        dedup=not args.no_dedup,
        concurrency=args.concurrency,
        progress=on_progress,
        dedup_embedder=dedup_embedder,
        dedup_threshold=args.dedup_threshold,
        stats=gen_stats,
    )

    if args.dedup_semantic and gen_stats.get("rejected_semantic"):
        print(f" semantic dedup: rejected {gen_stats['rejected_semantic']} of "
              f"{gen_stats['attempts']} candidates (cosine ≥ {args.dedup_threshold})",
              file=sys.stderr)

    if len(data) < args.num:
        # Explain the shortfall in terms of whichever dedup mode was active.
        if args.dedup_semantic:
            reason = "raise --dedup-threshold or add slot variety"
        else:
            reason = ("the seed's template×slot space is exhausted "
                      "(add slot variety or pass --no-dedup)")
        print(f" note: produced {len(data)} of {args.num} requested, {reason}.",
              file=sys.stderr)

    out = args.out or "synth_text.jsonl"
    write_jsonl(out, to_format(data, args.format))
    print(f" wrote {len(data)} records to {out}"
          + (f" ({args.format} format)" if args.format != "raw" else ""))

    if args.no_grade:
        return 0

    against = read_records(args.against) if args.against else None
    report = grade_dataset(data, against=against, ngram=args.ngram,
                           embedder=_maybe_embedder(args))
    print_report(report, dataset=out, use_color=not args.no_color)
    if args.report_json:
        write_text(args.report_json, to_json(report, out))
    if args.html:
        write_text(args.html, to_html(report, out))
    return 0
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
# ---- grade -------------------------------------------------------------------
|
| 111 |
+
|
| 112 |
+
def cmd_grade(args) -> int:
    """Handle ``synthkit grade``: grade an existing dataset file.

    Exits with an error message when the file yields no records. Optionally
    writes JSON/HTML reports. Returns 1 when ``--min-grade`` is given and the
    dataset's grade ranks below it in ``_GRADE_ORDER``; otherwise returns 0.
    """
    records = read_records(args.path)
    if not records:
        sys.exit(f"error: no records found in {args.path}")

    if args.against:
        against = read_records(args.against)
    else:
        against = None

    report = grade_dataset(
        records,
        fields=args.field or None,  # empty --field list means auto-detect
        against=against,
        ngram=args.ngram,
        embedder=_maybe_embedder(args),
    )
    print_report(report, dataset=args.path, use_color=not args.no_color)

    if args.json:
        write_text(args.json, to_json(report, args.path))
        print(f" JSON written to {args.json}")
    if args.html:
        write_text(args.html, to_html(report, args.path))
        print(f" HTML written to {args.html}")

    # CI gate: compare list positions, lower index = worse grade.
    # NOTE(review): assumes report.grade is always one of _GRADE_ORDER;
    # list.index raises ValueError otherwise — confirm against to_grade.
    floor = args.min_grade
    if floor and _GRADE_ORDER.index(report.grade) < _GRADE_ORDER.index(floor):
        print(f" grade {report.grade} is below --min-grade {args.min_grade}",
              file=sys.stderr)
        return 1
    return 0
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
# ---- list / roadmap ----------------------------------------------------------
|
| 136 |
+
|
| 137 |
+
def cmd_list(args) -> int:
    """Handle ``synthkit list``: print the product table and built-in seeds.

    ``args`` is accepted for handler-signature uniformity and is not read.
    Returns 0.
    """
    banner = (
        "\nsynthkit products\n",
        " text (A) live LLM instruction & eval datasets + quality grading",
        " tabular (B) roadmap schema-aware fixtures with referential integrity",
        " privacy (C) roadmap privacy-safe synthetic twins of real datasets",
        "\n built-in text seeds (use with: text gen --demo, or copy from examples/)\n",
    )
    for line in banner:
        print(line)

    for name, spec in BUILTIN_SEEDS.items():
        template_count = len(spec["templates"])
        # One "<count> <slot name>" term per slot, joined to show the product space.
        slot_summary = " × ".join(
            f"{len(values)} {slot}" for slot, values in spec["slots"].items()
        )
        print(f" {name:<12} {spec['kind']:<12} {template_count} templates · {slot_summary}")
    print()
    return 0
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def cmd_coming_soon(args) -> int:
    """Print the roadmap notice for a product that is not implemented yet.

    Reads ``args.product`` and ``args.blurb`` (installed on the stub parsers
    through ``set_defaults``). Returns 0.
    """
    notice = (
        f"\n synthkit {args.product}: {args.blurb}",
        " On the roadmap. Product A (`synthkit text`) is live today and B/C share",
        " the same core: providers, the grading engine, and the report writer.\n",
    )
    for line in notice:
        print(line)
    return 0
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
# ---- parser ------------------------------------------------------------------
|
| 159 |
+
|
| 160 |
+
def _add_text_parser(commands) -> None:
    """Register ``text`` (Product A) and its ``gen`` subcommand."""
    text_cmd = commands.add_parser("text", help="Product A: LLM instruction & eval datasets")
    text_commands = text_cmd.add_subparsers(dest="text_cmd")
    gen_cmd = text_commands.add_parser("gen", help="generate a text dataset and grade it")

    add = gen_cmd.add_argument
    add("--demo", action="store_true",
        help="generate + grade a built-in coding eval set (no setup)")
    add("--seed", help="path to a JSON/YAML seed spec")
    add("-n", "--num", type=int, default=200, help="records to generate")
    add("--kind", choices=["eval", "instruction"],
        help="override the spec's kind")
    add("--provider", default="none",
        choices=["none", "ollama", "anthropic", "openai"],
        help="response generator for instruction data (default: none)")
    add("--model", default="", help="model name for the provider")
    add("-o", "--out", help="output JSONL path (default: synth_text.jsonl)")
    add("--seed-int", type=int, default=17, help="RNG seed (default: 17)")
    add("--no-dedup", action="store_true",
        help="keep exact-duplicate prompts instead of skipping them")
    add("--no-grade", action="store_true", help="skip grading the output")
    add("--against", help="held-out eval set to check contamination against")
    add("--ngram", type=int, default=8, help="contamination n-gram size")
    add("--format", default="raw", choices=list(FORMATS),
        help="output schema: raw|alpaca|sharegpt|openai (default: raw)")
    add("--concurrency", type=int, default=4,
        help="parallel provider calls when filling responses (default: 4)")
    add("--no-progress", action="store_true",
        help="hide the response progress line")
    add("--semantic", action="store_true",
        help="add an embedding-based semantic-dedup axis to grading")
    add("--embed-provider", default="ollama", choices=["ollama", "openai"],
        help="embedder for --semantic (default: ollama)")
    add("--embed-model", default="", help="embedding model name")
    add("--dedup-semantic", action="store_true",
        help="reject semantically-similar records during generation "
             "(clean-by-construction; uses --embed-provider)")
    add("--dedup-threshold", type=float, default=0.9,
        help="cosine ≥ this ⇒ reject as a semantic duplicate (default: 0.9)")
    add("--report-json", help="write the grade report as JSON")
    add("--html", help="write the grade report as HTML")
    add("--no-color", action="store_true", help="disable ANSI colors")
    gen_cmd.set_defaults(func=cmd_text_gen)


def _add_grade_parser(commands) -> None:
    """Register ``grade``, the grading command shared across all products."""
    grade_cmd = commands.add_parser(
        "grade", help="grade any dataset (jsonl/json/csv) for quality")

    add = grade_cmd.add_argument
    add("path", help="dataset to grade")
    add("--against", help="held-out eval set for the contamination check")
    add("--field", action="append", default=[],
        help="field(s) to analyze (repeatable; default: auto-detect)")
    add("--ngram", type=int, default=8, help="contamination n-gram size")
    add("--min-grade", choices=_GRADE_ORDER,
        help="exit non-zero if the grade is below this (CI gate)")
    add("--semantic", action="store_true",
        help="add an embedding-based semantic-dedup axis")
    add("--embed-provider", default="ollama", choices=["ollama", "openai"],
        help="embedder for --semantic (default: ollama)")
    add("--embed-model", default="", help="embedding model name")
    add("--json", help="write the report as JSON")
    add("--html", help="write the report as HTML")
    add("--no-color", action="store_true", help="disable ANSI colors")
    grade_cmd.set_defaults(func=cmd_grade)


def _add_roadmap_stubs(commands) -> None:
    """Register the not-yet-implemented ``tabular`` and ``privacy`` commands."""
    tabular_cmd = commands.add_parser(
        "tabular", help="Product B: schema-aware fixtures [roadmap]")
    tabular_cmd.set_defaults(
        func=cmd_coming_soon, product="tabular",
        blurb="schema-aware fixtures with referential integrity")

    privacy_cmd = commands.add_parser(
        "privacy", help="Product C: privacy-safe twins [roadmap]")
    privacy_cmd.set_defaults(
        func=cmd_coming_soon, product="privacy",
        blurb="privacy-safe synthetic twins of real datasets")


def build_parser() -> argparse.ArgumentParser:
    """Build the top-level ``synthkit`` argument parser.

    Subcommands are registered in the order text, grade, list, tabular,
    privacy. Every leaf command stores its handler under ``func`` through
    ``set_defaults``; the bare ``text`` command has no ``func`` of its own.
    """
    root = argparse.ArgumentParser(
        prog="synthkit",
        description="Generate synthetic data and grade it for quality "
                    "(validity, uniqueness, diversity, contamination).",
    )
    root.add_argument("--version", action="version", version=f"synthkit {__version__}")
    commands = root.add_subparsers(dest="cmd")

    _add_text_parser(commands)
    _add_grade_parser(commands)

    list_cmd = commands.add_parser("list", help="list products and built-in seeds")
    list_cmd.set_defaults(func=cmd_list)

    _add_roadmap_stubs(commands)
    return root
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
def main(argv: List[str] = None) -> None:
    """CLI entry point: parse ``argv`` and run the selected command.

    Every path ends in ``sys.exit``: status 0 after printing help when no
    command is given, the handler's return value otherwise, or an
    ``error: …`` message when the handler raises ``SynthkitError``.
    ``argv=None`` lets argparse read ``sys.argv``.
    """
    cli = build_parser()
    namespace = cli.parse_args(argv)

    command = getattr(namespace, "cmd", None)
    if not command:
        cli.print_help()
        sys.exit(0)

    # Bare `synthkit text` has no handler attached; re-parse with --help so
    # argparse prints the text subcommand's usage (and exits).
    has_handler = bool(getattr(namespace, "func", None))
    if command == "text" and not has_handler:
        cli.parse_args(["text", "--help"])

    try:
        sys.exit(namespace.func(namespace))
    except SynthkitError as exc:
        sys.exit(f"error: {exc}")
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|
synthkit/formats.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Convert generated records into common fine-tuning dataset formats.
|
| 2 |
+
|
| 3 |
+
raw what the generator emits (alpaca-ish: instruction/input/output, or prompt)
|
| 4 |
+
alpaca {instruction, input, output}
|
| 5 |
+
sharegpt {conversations: [{from: human, value}, {from: gpt, value}]}
|
| 6 |
+
openai {messages: [{role: system?}, {role: user}, {role: assistant}]}
|
| 7 |
+
|
| 8 |
+
Eval (prompt-only) records keep their prompt as the human/user turn with an
|
| 9 |
+
empty completion, so the same dataset can drive evaluation or be completed later.
|
| 10 |
+
"""
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
from typing import Any, Dict, List
|
| 14 |
+
|
| 15 |
+
from synthkit.models import SynthkitError
|
| 16 |
+
|
| 17 |
+
# Output schemas accepted by `to_format`; any other name is rejected there.
FORMATS = ("raw", "alpaca", "sharegpt", "openai")
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _parts(rec: Dict[str, Any]):
    """Pull the pieces shared by every output schema out of one record.

    Returns ``(system, instruction, user_input, user, output)``.
    ``instruction`` falls back to the ``prompt`` key and ``output`` to the
    ``response`` key; a missing value becomes the empty string. ``user`` is
    the instruction, followed by a blank line and the input when the input
    is non-empty.
    """
    prompt_fallback = rec.get("prompt", "")
    response_fallback = rec.get("response", "")

    system = rec.get("system", "")
    instruction = rec.get("instruction", prompt_fallback)
    user_input = rec.get("input", "")
    output = rec.get("output", response_fallback)

    if user_input:
        user = f"{instruction}\n\n{user_input}"
    else:
        user = instruction
    return system, instruction, user_input, user, output
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def to_format(records: List[Dict[str, Any]], fmt: str) -> List[Dict[str, Any]]:
    """Convert generated records into the ``fmt`` fine-tuning schema.

    ``"raw"`` returns ``records`` itself, unchanged. The other formats build
    a new list with one converted dict per record; the system turn/message is
    included only when the record has a non-empty ``system`` value.

    Raises:
        SynthkitError: if ``fmt`` is not one of ``FORMATS``.
    """
    if fmt not in FORMATS:
        raise SynthkitError(f"unknown --format {fmt!r} (choose from {', '.join(FORMATS)})")
    if fmt == "raw":
        return records

    # Decompose every record first, then branch on the format once.
    pieces = [_parts(rec) for rec in records]

    if fmt == "alpaca":
        return [
            {"instruction": instruction, "input": user_input, "output": output}
            for _system, instruction, user_input, _user, output in pieces
        ]

    if fmt == "sharegpt":
        return [
            {"conversations": (
                ([{"from": "system", "value": system}] if system else [])
                + [{"from": "human", "value": user},
                   {"from": "gpt", "value": output}]
            )}
            for system, _instruction, _user_input, user, output in pieces
        ]

    if fmt == "openai":
        return [
            {"messages": (
                ([{"role": "system", "content": system}] if system else [])
                + [{"role": "user", "content": user},
                   {"role": "assistant", "content": output}]
            )}
            for system, _instruction, _user_input, user, output in pieces
        ]

    # A format listed in FORMATS without a converter produces no records.
    return []
|
synthkit/grading.py
ADDED
|
@@ -0,0 +1,422 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""The grading engine, the part every synthkit product shares.
|
| 2 |
+
|
| 3 |
+
Given a list of records, score the dataset on four axes:
|
| 4 |
+
|
| 5 |
+
validity structurally sound records (required fields, non-empty, sane length)
|
| 6 |
+
uniqueness free of exact and near-duplicate records (MinHash + LSH)
|
| 7 |
+
diversity lexical variety across the set (distinct-n, self-similarity)
|
| 8 |
+
contamination overlap with a held-out eval/benchmark set (n-gram containment)
|
| 9 |
+
|
| 10 |
+
Everything here is standard-library only and deterministic for a fixed seed.
|
| 11 |
+
"""
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import hashlib
|
| 15 |
+
import math
|
| 16 |
+
import random
|
| 17 |
+
import re
|
| 18 |
+
from typing import Any, Dict, List, Optional, Sequence, Set, Tuple
|
| 19 |
+
|
| 20 |
+
from synthkit.models import DimensionScore, GradeReport, to_grade
|
| 21 |
+
|
| 22 |
+
# 2**61 - 1: the modulus for the MinHash hash functions in `_MinHasher`, and
# the upper bound when drawing their random coefficients.
_MERSENNE = (1 << 61) - 1
|
| 23 |
+
# A token is a maximal run of ASCII lowercase letters/digits. `tokens()`
# lowercases the text first, so uppercase ASCII letters are matched too.
_WORD = re.compile(r"[a-z0-9]+")
|
| 24 |
+
# Content-bearing keys analyzed when the caller names no fields. Role/config
# keys such as "system" are deliberately left out.
_KNOWN_FIELDS = (
    "instruction",
    "input",
    "prompt",
    "question",
    "output",
    "response",
    "answer",
    "text",
)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# ---- text helpers ------------------------------------------------------------
|
| 30 |
+
|
| 31 |
+
def record_text(rec: Dict[str, Any], fields: Optional[Sequence[str]]) -> str:
    """Join the analyzed fields of ``rec`` into one space-separated string.

    Field selection, in priority order:
      1. ``fields`` when it is non-empty (missing keys contribute ``""``);
      2. the ``_KNOWN_FIELDS`` entries present in the record;
      3. every key whose value is a ``str``.

    Values are passed through ``str()``; ``None`` values are dropped.
    """
    if fields:
        chosen = list(fields)
    else:
        chosen = [name for name in _KNOWN_FIELDS if name in rec]
        if not chosen:
            chosen = [name for name, value in rec.items() if isinstance(value, str)]

    parts: List[str] = []
    for name in chosen:
        value = rec.get(name, "")
        if value is not None:
            parts.append(str(value))
    return " ".join(parts)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def tokens(text: str) -> List[str]:
    """Lowercase ``text`` and return every ``_WORD`` match, in order."""
    lowered = text.lower()
    return _WORD.findall(lowered)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def _stable_hash(s: str) -> int:
    """Hash ``s`` to an unsigned 64-bit integer.

    Uses an 8-byte BLAKE2b digest of the UTF-8 encoding, read big-endian.
    """
    digest = hashlib.blake2b(s.encode("utf-8"), digest_size=8).digest()
    return int.from_bytes(digest, "big")
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def shingles(toks: Sequence[str], k: int) -> Set[str]:
    """Return the set of space-joined ``k``-token windows over ``toks``.

    An empty sequence yields an empty set; a sequence shorter than ``k``
    yields a single shingle holding all of its tokens.
    """
    if not toks:
        return set()

    window_count = len(toks) - k + 1
    if window_count < 1:
        # Too short for even one full window: use the whole thing.
        return {" ".join(toks)}

    result: Set[str] = set()
    for start in range(window_count):
        result.add(" ".join(toks[start:start + k]))
    return result
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def ngram_set(toks: Sequence[str], n: int) -> Set[Tuple[str, ...]]:
    """Return the distinct word ``n``-grams of ``toks`` as tuples.

    An empty sequence yields an empty set; a sequence of at most ``n`` tokens
    yields one tuple containing all of them.
    """
    if not toks:
        return set()

    size = len(toks)
    if size <= n:
        return {tuple(toks)}

    grams: Set[Tuple[str, ...]] = set()
    for start in range(size - n + 1):
        grams.add(tuple(toks[start:start + n]))
    return grams
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
# ---- MinHash + LSH near-duplicate detection ----------------------------------
|
| 69 |
+
|
| 70 |
+
class _MinHasher:
    """Computes MinHash signatures from ``num_perm`` seeded hash functions.

    Each function has the form ``(a * h + b) % _MERSENNE`` with coefficients
    drawn from ``random.Random(seed)``, so signatures are reproducible for a
    fixed seed.
    """

    def __init__(self, num_perm: int, seed: int) -> None:
        rng = random.Random(seed)
        # Draw every multiplier first, then every offset, to keep the RNG
        # stream (and therefore the signatures) stable.
        multipliers = []
        for _ in range(num_perm):
            multipliers.append(rng.randrange(1, _MERSENNE))
        offsets = []
        for _ in range(num_perm):
            offsets.append(rng.randrange(0, _MERSENNE))
        self.a = multipliers
        self.b = offsets

    def sign(self, shs: Set[str]) -> Optional[Tuple[int, ...]]:
        """Return the signature of ``shs``, or ``None`` for an empty set."""
        if not shs:
            return None
        hashed = [_stable_hash(item) for item in shs]
        signature = []
        for mult, shift in zip(self.a, self.b):
            # Minimum image of the set under this hash function.
            signature.append(min((mult * value + shift) % _MERSENNE for value in hashed))
        return tuple(signature)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
class _UnionFind:
    """Disjoint-set forest over the integers ``0 .. n-1``.

    ``union`` always makes the smaller index the root, so each cluster's root
    is its lowest-numbered member.
    """

    def __init__(self, n: int) -> None:
        # Every element starts as its own root.
        self.parent = list(range(n))

    def find(self, x: int) -> int:
        """Return the root of ``x``, shortening the path along the way."""
        parent = self.parent
        while True:
            up = parent[x]
            if up == x:
                return x
            # Point x at its grandparent, then step to it.
            parent[x] = parent[up]
            x = parent[x]

    def union(self, x: int, y: int) -> None:
        """Merge the sets containing ``x`` and ``y``."""
        root_x = self.find(x)
        root_y = self.find(y)
        if root_x == root_y:
            return
        # The smaller index stays root: it is the "original" of the cluster.
        keep, absorb = (root_x, root_y) if root_x < root_y else (root_y, root_x)
        self.parent[absorb] = keep
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
class _LSHIndex:
    """Banded MinHash (LSH) index over a list of shingle sets.

    Build it once; ``candidates(shingles)`` then returns the indices of the
    indexed sets that share at least one signature band with the query. The
    near-duplicate search uses this to avoid comparing every pair of records.
    Empty shingle sets are never indexed.
    """

    def __init__(self, shingle_sets: List[Set[str]], *, num_perm: int = 64,
                 bands: int = 16, seed: int = 17) -> None:
        self._hasher = _MinHasher(num_perm, seed)
        self._bands = bands
        self._rows = num_perm // bands  # signature entries per band
        self._buckets: Dict[Tuple[int, Tuple[int, ...]], List[int]] = {}
        for position, members in enumerate(shingle_sets):
            signature = self._hasher.sign(members)
            if signature is not None:
                for key in self._band_keys(signature):
                    self._buckets.setdefault(key, []).append(position)

    def _band_keys(self, signature: Tuple[int, ...]) -> List[Tuple[int, Tuple[int, ...]]]:
        """Split a signature into ``(band number, band slice)`` bucket keys."""
        width = self._rows
        return [(band, signature[band * width:(band + 1) * width])
                for band in range(self._bands)]

    def candidates(self, shingset: Set[str]) -> Set[int]:
        """Return indexed positions colliding with ``shingset`` in any band."""
        signature = self._hasher.sign(shingset)
        if signature is None:
            return set()
        found: Set[int] = set()
        for key in self._band_keys(signature):
            found.update(self._buckets.get(key, ()))
        return found
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def _duplicate_map(shingle_sets: List[Set[str]], *, threshold: float = 0.8,
                   seed: int = 17) -> Dict[int, int]:
    """Map each near-duplicate position to the root of its cluster.

    Two entries are linked when they are LSH candidates and their exact
    Jaccard similarity is at least ``threshold``. Linked entries form
    clusters rooted at the lowest position; every non-root member appears in
    the result as ``{position: root_position}``. Empty shingle sets never
    take part.
    """
    total = len(shingle_sets)
    index = _LSHIndex(shingle_sets, seed=seed)
    clusters = _UnionFind(total)

    for current, members in enumerate(shingle_sets):
        if not members:
            continue
        # Only look backwards so each pair is examined once.
        earlier = [pos for pos in index.candidates(members) if pos < current]
        for pos in earlier:
            other = shingle_sets[pos]
            if not other:
                continue
            jaccard = len(members & other) / len(members | other)
            if jaccard >= threshold:
                clusters.union(current, pos)

    dup_of: Dict[int, int] = {}
    for pos, root in enumerate(map(clusters.find, range(total))):
        if root != pos:
            dup_of[pos] = root
    return dup_of
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
# ---- the four dimensions -----------------------------------------------------
|
| 163 |
+
|
| 164 |
+
def _validity_dim(records, texts, *, min_words, max_words) -> DimensionScore:
    """Score the share of records that are non-empty and of sane length.

    A text is bad if it is blank, has fewer than ``min_words`` tokens, or has
    more than ``max_words`` tokens (checked in that order). The score is the
    percentage of good records, or 0.0 when there are no records. Up to three
    example problems are appended to the findings.
    """
    total = len(records)
    tally = {"empty": 0, "too_short": 0, "too_long": 0}
    samples: List[str] = []

    def _remember(note: str) -> None:
        # Keep the report short: at most three concrete examples.
        if len(samples) < 3:
            samples.append(note)

    for text in texts:
        word_count = len(tokens(text))
        if not text.strip():
            tally["empty"] += 1
            _remember("empty record")
        elif word_count < min_words:
            tally["too_short"] += 1
            _remember(f"only {word_count} words: {text[:60]!r}")
        elif word_count > max_words:
            tally["too_long"] += 1
            _remember(f"{word_count} words (over {max_words})")

    bad = tally["empty"] + tally["too_short"] + tally["too_long"]
    score = 100.0 * (1 - bad / total) if total else 0.0

    findings: List[str] = []
    if tally["empty"]:
        findings.append(f"{tally['empty']} empty record(s)")
    if tally["too_short"]:
        findings.append(f"{tally['too_short']} below {min_words} words")
    if tally["too_long"]:
        findings.append(f"{tally['too_long']} above {max_words} words")
    findings.extend(samples)

    return DimensionScore(
        "validity",
        "Validity",
        round(score, 1),
        f"{total - bad}/{total} records well-formed",
        findings,
        {**tally, "n": total},
        weight=0.25,
    )
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
def _uniqueness_dim(records, texts, shingle_sets, *, seed) -> DimensionScore:
    """Score the share of records that are neither exact nor near duplicates.

    Exact duplicates are texts whose token sequence matches an earlier
    record's. Near duplicates are then searched only among the remaining
    representatives via ``_duplicate_map``. The score is the percentage of
    records left after removing both kinds, or 0.0 when there are no records.
    """
    total = len(records)

    # Pass 1: exact duplicates on the normalized (tokenized, re-joined) text.
    seen_norms: Set[str] = set()
    exact_dup: Set[int] = set()
    representatives: List[int] = []
    for position, text in enumerate(texts):
        normalized = " ".join(tokens(text))
        if normalized and normalized in seen_norms:
            exact_dup.add(position)
            continue
        if normalized:
            seen_norms.add(normalized)
        representatives.append(position)

    # Pass 2: near-duplicate search over the exact-unique representatives
    # only, which keeps it cheap. Positions returned are local to that list.
    local_dup = _duplicate_map([shingle_sets[pos] for pos in representatives], seed=seed)

    near_dup: Set[int] = set()
    cluster_roots: Set[int] = set()
    example = ""
    for local_pos, local_root in local_dup.items():
        dup_pos = representatives[local_pos]
        root_pos = representatives[local_root]
        near_dup.add(dup_pos)
        cluster_roots.add(root_pos)
        if not example:
            example = f"e.g. #{dup_pos} ≈ #{root_pos}: {texts[dup_pos][:64]!r}"

    exact_count = len(exact_dup)
    near_count = len(near_dup)
    dup_total = exact_count + near_count
    score = 100.0 * (1 - dup_total / total) if total else 0.0

    findings: List[str] = []
    if exact_dup:
        findings.append(f"{exact_count} exact duplicate(s)")
    if near_dup:
        findings.append(f"{near_count} near-duplicate(s) in {len(cluster_roots)} cluster(s)")
    if example:
        findings.append(example)

    return DimensionScore(
        "uniqueness",
        "Uniqueness",
        round(score, 1),
        f"{total - dup_total}/{total} unique ({exact_count} exact, {near_count} near)",
        findings,
        {"exact": exact_count, "near": near_count,
         "clusters": len(cluster_roots), "n": total},
        weight=0.30,
    )
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
def _diversity_dim(texts, shingle_sets, *, seed) -> DimensionScore:
    """Score lexical variety and pairwise distinctness of the records.

    Combines corpus-level distinct-1 / distinct-2 ratios with the mean Jaccard
    similarity of a seeded random sample of record pairs (records with empty
    shingle sets are excluded from the sampling).

    Args:
        texts: one text per record.
        shingle_sets: precomputed shingle set for each text, index-aligned.
        seed: seeds the pair sampler so the score is reproducible.

    Returns:
        The "diversity" DimensionScore (weight 0.25).
    """
    unigrams: List[str] = []
    bigrams: List[Tuple[str, str]] = []
    for raw in texts:
        toks = tokens(raw)
        unigrams.extend(toks)
        bigrams.extend(zip(toks, toks[1:]))

    vocab = len(set(unigrams))
    d1 = vocab / len(unigrams) if unigrams else 0.0
    d2 = len(set(bigrams)) / len(bigrams) if bigrams else 0.0

    # Self-similarity: mean Jaccard over a seeded sample of record pairs.
    rng = random.Random(seed)
    usable = [pos for pos, sh in enumerate(shingle_sets) if sh]

    def _sampled_jaccard() -> float:
        first, second = rng.sample(usable, 2)
        left, right = shingle_sets[first], shingle_sets[second]
        return len(left & right) / len(left | right)

    sims: List[float] = []
    if len(usable) >= 2:
        draws = min(2000, len(usable) * 4)
        sims = [_sampled_jaccard() for _ in range(draws)]
    self_sim = sum(sims) / len(sims) if sims else 0.0

    # Pairwise distinctness (1 - self-similarity) is the size-stable signal and
    # carries the larger weight; distinct-2 is a secondary lexical-variety term
    # with a lenient target because corpus-level distinct-n shrinks as the set
    # grows. The raw numbers are always reported in stats.
    distinctness = 1 - self_sim
    variety = min(1, d2 / 0.25)
    score = 100.0 * (0.6 * distinctness + 0.4 * variety)

    findings: List[str] = []
    if d2 < 0.4:
        findings.append("low bigram diversity, templates may be too repetitive")
    if self_sim > 0.3:
        findings.append(f"records are {self_sim * 100:.0f}% similar on average")

    summary = f"distinct-2 {d2:.2f} · distinct-1 {d1:.2f} · self-sim {self_sim:.2f}"
    stats = {
        "distinct_1": round(d1, 4),
        "distinct_2": round(d2, 4),
        "self_similarity": round(self_sim, 4),
        "vocab": vocab,
    }
    return DimensionScore(
        "diversity", "Diversity", round(score, 1), summary, findings, stats,
        weight=0.25)
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
def _contamination_dim(texts, shingle_sets, against_texts, *, ngram) -> DimensionScore:
    """Score how many records overlap a held-out evaluation set.

    A record is flagged when it is a near-duplicate of an eval item (Jaccard
    >= 0.7 on shingles, candidates looked up via an LSH index) or when at least
    80% of its n-grams also occur in the eval set.

    Args:
        texts: one text per record.
        shingle_sets: precomputed shingle set for each text, index-aligned.
        against_texts: eval-set texts, or None when no eval set was given.
        ngram: n-gram length used by the containment check.

    Returns:
        The "contamination" DimensionScore (weight 0.20); its score is None
        (not applicable) when ``against_texts`` is None.
    """
    if against_texts is None:
        return DimensionScore(
            "contamination", "Contamination", None,
            "no eval set provided (pass --against to check)", [], {}, weight=0.20)

    eval_ngrams: Set[Tuple[str, ...]] = set()
    eval_shingles: List[Set[str]] = []
    for eval_text in against_texts:
        eval_toks = tokens(eval_text)
        eval_ngrams.update(ngram_set(eval_toks, ngram))
        eval_shingles.append(shingles(eval_toks, 5))
    # The index avoids an O(records x eval) pairwise scan.
    eval_index = _LSHIndex(eval_shingles, seed=17)

    flagged: List[Tuple[int, str, str]] = []
    for pos, raw in enumerate(texts):
        toks = tokens(raw)
        own = shingle_sets[pos]
        reason = ""

        # Pass 1: near-duplicate of an eval item.
        if own:
            candidates = (eval_shingles[j] for j in eval_index.candidates(own))
            if any(other and len(own & other) / len(own | other) >= 0.7
                   for other in candidates):
                reason = "near-duplicate of an eval item"

        # Pass 2: n-gram containment, i.e. the fraction of THIS record's
        # n-grams found in the eval set. Shared template boilerplate only
        # contributes a few n-grams, so it is not over-flagged the way a raw
        # "shares any n-gram" check would be.
        if not reason:
            grams = ngram_set(toks, ngram)
            if grams:
                contained = sum(g in eval_ngrams for g in grams) / len(grams)
                if contained >= 0.8:
                    reason = f"{contained * 100:.0f}% of its {ngram}-grams are in the eval set"

        if reason:
            flagged.append((pos, reason, raw[:64]))

    total = len(texts)
    n_flagged = len(flagged)
    score = 100.0 * (1 - n_flagged / total) if total else 100.0
    if flagged:
        summary = f"{n_flagged}/{total} records overlap the eval set"
    else:
        summary = f"clean: 0/{total} overlap the eval set"
    findings = [f"#{i}: {why}: {snip!r}" for i, why, snip in flagged[:4]]
    return DimensionScore(
        "contamination", "Contamination", round(score, 1), summary, findings,
        {"flagged": n_flagged, "n": total, "ngram": ngram}, weight=0.20)
|
| 321 |
+
|
| 322 |
+
|
| 323 |
+
# ---- optional semantic axis (embedding-based) --------------------------------
|
| 324 |
+
|
| 325 |
+
def _unit(v: Sequence[float]) -> List[float]:
|
| 326 |
+
norm = math.sqrt(sum(x * x for x in v)) or 1.0
|
| 327 |
+
return [x / norm for x in v]
|
| 328 |
+
|
| 329 |
+
|
| 330 |
+
def _cos_unit(a: Sequence[float], b: Sequence[float]) -> float:
|
| 331 |
+
return sum(x * y for x, y in zip(a, b))
|
| 332 |
+
|
| 333 |
+
|
| 334 |
+
def _semantic_dups(units: List[List[float]], threshold: float) -> Set[int]:
|
| 335 |
+
n = len(units)
|
| 336 |
+
dup: Set[int] = set()
|
| 337 |
+
try:
|
| 338 |
+
import numpy as np # fast path if available
|
| 339 |
+
sims = np.asarray(units) @ np.asarray(units).T
|
| 340 |
+
for a in range(n):
|
| 341 |
+
if a in dup:
|
| 342 |
+
continue
|
| 343 |
+
row = sims[a]
|
| 344 |
+
for b in range(a + 1, n):
|
| 345 |
+
if b not in dup and row[b] >= threshold:
|
| 346 |
+
dup.add(b)
|
| 347 |
+
except ImportError:
|
| 348 |
+
for a in range(n):
|
| 349 |
+
if a in dup:
|
| 350 |
+
continue
|
| 351 |
+
ua = units[a]
|
| 352 |
+
for b in range(a + 1, n):
|
| 353 |
+
if b not in dup and _cos_unit(ua, units[b]) >= threshold:
|
| 354 |
+
dup.add(b)
|
| 355 |
+
return dup
|
| 356 |
+
|
| 357 |
+
|
| 358 |
+
def _semantic_dim(texts, embedder, *, threshold, seed, max_n=400) -> DimensionScore:
    """Score semantic (embedding-based) distinctness of the records.

    Blank texts are ignored. When more than *max_n* texts remain, a seeded
    random sample of *max_n* is embedded instead and the summary says so.

    Args:
        texts: one text per record.
        embedder: object exposing ``embed(list_of_texts)`` that returns one
            vector per text.
        threshold: cosine similarity at or above which two records count as
            semantic duplicates.
        seed: seeds both the subsampling and the mean-cosine pair sampling.
        max_n: maximum number of texts to embed.

    Returns:
        The "semantic" DimensionScore (weight 0.20); score is 100.0 when there
        is nothing to embed.
    """
    idx = [i for i, t in enumerate(texts) if t.strip()]
    note = ""
    if len(idx) > max_n:
        idx = sorted(random.Random(seed).sample(idx, max_n))
        note = f" (sampled {max_n})"

    # Skip the provider call when every text is blank: an empty batch is
    # pointless work and some remote embedding endpoints reject it outright.
    vectors = embedder.embed([texts[i] for i in idx]) if idx else []
    units = [_unit(v) for v in vectors]
    dup = _semantic_dups(units, threshold)

    # Mean cosine over a seeded sample of pairs (reported, not scored).
    rng = random.Random(seed)
    sims = []
    if len(units) >= 2:
        for _ in range(min(2000, len(units) * 4)):
            a, b = rng.sample(range(len(units)), 2)
            sims.append(_cos_unit(units[a], units[b]))
    mean_sim = sum(sims) / len(sims) if sims else 0.0

    n = len(idx)
    score = 100.0 * (1 - len(dup) / n) if n else 100.0
    findings = []
    if dup:
        findings.append(f"{len(dup)} semantic near-duplicate(s) at cosine ≥ {threshold} "
                        "(paraphrases lexical dedup misses)")
    summary = f"{n - len(dup)}/{n} semantically distinct · mean cosine {mean_sim:.2f}{note}"
    return DimensionScore(
        "semantic", "Semantic dedup", round(score, 1), summary, findings,
        {"semantic_dups": len(dup), "mean_cosine": round(mean_sim, 4),
         "n": n, "threshold": threshold}, weight=0.20)
|
| 384 |
+
|
| 385 |
+
|
| 386 |
+
# ---- public API --------------------------------------------------------------
|
| 387 |
+
|
| 388 |
+
def grade_dataset(records: List[Dict[str, Any]], *,
                  fields: Optional[Sequence[str]] = None,
                  against: Optional[List[Dict[str, Any]]] = None,
                  against_fields: Optional[Sequence[str]] = None,
                  ngram: int = 8, min_words: int = 3, max_words: int = 512,
                  seed: int = 17, embedder=None,
                  semantic_threshold: float = 0.83) -> GradeReport:
    """Grade a dataset on every quality axis and blend them into one score.

    Args:
        records: the dataset to grade.
        fields: record fields that make up each record's text (passed to
            ``record_text``); reported as "auto" in the meta when falsy.
        against: optional eval-set records for the contamination check.
        against_fields: fields to read from the eval records; falls back to
            *fields* when falsy.
        ngram: n-gram length for the contamination containment check.
        min_words: lower word bound forwarded to the validity axis.
        max_words: upper word bound forwarded to the validity axis.
        seed: seed forwarded to the sampling-based axes.
        embedder: optional embedder; when given, the semantic axis is added.
        semantic_threshold: cosine threshold for the semantic axis.

    Returns:
        A GradeReport whose score is the weight-normalised mean of the
        applicable axes. An empty dataset is graded "F" with score 0.0 and no
        dimensions.
    """
    if not records:
        return GradeReport(grade="F", score=0.0, n_records=0, dimensions=[],
                           meta={"note": "empty dataset"})

    texts = [record_text(rec, fields) for rec in records]
    shingle_sets = [shingles(tokens(text), 5) for text in texts]

    if against is None:
        against_texts = None
    else:
        eval_fields = against_fields or fields
        against_texts = [record_text(rec, eval_fields) for rec in against]

    dims = [
        _validity_dim(records, texts, min_words=min_words, max_words=max_words),
        _uniqueness_dim(records, texts, shingle_sets, seed=seed),
        _diversity_dim(texts, shingle_sets, seed=seed),
        _contamination_dim(texts, shingle_sets, against_texts, ngram=ngram),
    ]
    if embedder is not None:
        dims.append(_semantic_dim(texts, embedder, threshold=semantic_threshold, seed=seed))

    # Only axes that produced a score take part in the blend; the weights are
    # re-normalised over those axes.
    weight_total = 0
    weighted_score = 0
    for axis in dims:
        if not axis.applicable:
            continue
        weight_total += axis.weight
        weighted_score += axis.score * axis.weight
    overall = weighted_score / (weight_total or 1.0)

    meta = {
        "fields": list(fields) if fields else "auto",
        "has_eval": against_texts is not None,
        "semantic": embedder is not None,
    }
    return GradeReport(
        grade=to_grade(overall), score=round(overall, 1), n_records=len(records),
        dimensions=dims, meta=meta)
|
synthkit/io_utils.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Read and write datasets as JSONL, JSON, or CSV, standard library only."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import csv
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
from typing import Any, Dict, List
|
| 8 |
+
|
| 9 |
+
from synthkit.models import SynthkitError
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def _ext(path: str) -> str:
|
| 13 |
+
return os.path.splitext(path)[1].lower()
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def read_records(path: str) -> List[Dict[str, Any]]:
    """Load records from .jsonl / .ndjson / .json / .csv / .tsv.

    A ``.json`` file may hold a list of records, a single record, or a dict
    wrapping the list under "data", "records", "rows" or "examples".

    Args:
        path: input file; the format is chosen from its extension.

    Returns:
        The list of records.

    Raises:
        SynthkitError: if the extension is unsupported, or a ``.json`` file's
            top-level value is neither a list nor a dict.
    """
    ext = _ext(path)

    if ext in (".csv", ".tsv"):
        delim = "\t" if ext == ".tsv" else ","
        # The csv module needs newline="" so that line breaks embedded in
        # quoted fields are passed through untouched.
        with open(path, "r", encoding="utf-8", newline="") as fh:
            return list(csv.DictReader(fh, delimiter=delim))

    if ext in (".jsonl", ".ndjson"):
        with open(path, "r", encoding="utf-8") as fh:
            return [json.loads(line) for line in fh if line.strip()]

    if ext == ".json":
        with open(path, "r", encoding="utf-8") as fh:
            data = json.load(fh)
        if isinstance(data, dict):
            # common wrappers: {"data": [...]} / {"records": [...]}
            for key in ("data", "records", "rows", "examples"):
                if isinstance(data.get(key), list):
                    return data[key]
            return [data]
        if isinstance(data, list):
            return data
        # A bare string used to be split into characters and a number raised
        # TypeError; report both as a user-facing input error instead.
        raise SynthkitError(
            f"{path}: expected a JSON array or object at the top level, "
            f"got {type(data).__name__}")

    # Checked without opening the file, so a bad extension is reported as
    # such even when the path does not exist.
    raise SynthkitError(f"unsupported input format {ext or path!r}")
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def write_jsonl(path: str, records: List[Dict[str, Any]]) -> None:
    """Write *records* to *path* as UTF-8 JSON Lines, one record per line.

    The parent directory is created if needed and non-ASCII characters are
    written as-is rather than escaped.
    """
    _ensure_dir(path)
    with open(path, "w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(rec, ensure_ascii=False) + "\n" for rec in records)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def write_text(path: str, text: str) -> None:
    """Write *text* to *path* as UTF-8, creating the parent directory if needed."""
    _ensure_dir(path)
    with open(path, "w", encoding="utf-8") as sink:
        sink.write(text)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def load_spec(path: str) -> Dict[str, Any]:
    """Load a seed spec from JSON (always) or YAML (if pyyaml is installed).

    Files ending in .yaml / .yml are parsed with ``yaml.safe_load``; anything
    else is parsed as JSON.

    Raises:
        SynthkitError: if a YAML spec is requested but pyyaml is not installed.
    """
    wants_yaml = _ext(path) in (".yaml", ".yml")
    with open(path, "r", encoding="utf-8") as fh:
        if not wants_yaml:
            return json.load(fh)
        try:
            import yaml  # optional dependency
        except ImportError as exc:  # pragma: no cover
            raise SynthkitError(
                "reading YAML seeds needs pyyaml. "
                "`pip install pyyaml`, or use a .json seed."
            ) from exc
        return yaml.safe_load(fh)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def _ensure_dir(path: str) -> None:
|
| 67 |
+
d = os.path.dirname(os.path.abspath(path))
|
| 68 |
+
os.makedirs(d, exist_ok=True)
|
synthkit/models.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Core data types shared across every synthkit product."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
from dataclasses import dataclass, field
|
| 5 |
+
from typing import Any, Dict, List, Optional
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class SynthkitError(Exception):
    """User-facing error (bad seed spec, unreadable input, provider failure).

    Library code raises this instead of calling sys.exit/SystemExit, so that
    callers embedding the library (the Gradio app, the tests) can catch it.
    The CLI converts it into a clean non-zero exit.
    """
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# Letter grades, best to worst, same scale as the rest of the portfolio.
# Each entry is (minimum score, letter); the first cutoff a score reaches wins.
GRADE_BANDS = [
    (97, "A+"),
    (93, "A"),
    (85, "B"),
    (75, "C"),
    (65, "D"),
    (0, "F"),
]


def to_grade(score: float) -> str:
    """Map a 0-100 score to its letter grade.

    Scores below every cutoff (negative values, NaN) fall back to "F".
    """
    reached = (letter for cutoff, letter in GRADE_BANDS if score >= cutoff)
    return next(reached, "F")
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@dataclass
class DimensionScore:
    """One quality axis of a grade report.

    Examples are validity, uniqueness, diversity, contamination and the
    optional semantic axis.
    """

    key: str                 # machine-readable identifier, e.g. "uniqueness"
    title: str               # display name
    score: Optional[float]   # 0-100, or None when the axis does not apply
    summary: str = ""        # one-line description of the result
    findings: List[str] = field(default_factory=list)    # human-readable notes
    stats: Dict[str, Any] = field(default_factory=dict)  # raw numbers
    weight: float = 1.0      # relative weight in the overall blend

    @property
    def applicable(self) -> bool:
        """Whether this axis produced a score (False means "n/a")."""
        if self.score is None:
            return False
        return True
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
@dataclass
class GradeReport:
    """The graded result for a dataset."""

    grade: str       # letter grade
    score: float     # 0-100 overall
    n_records: int   # number of records graded
    dimensions: List[DimensionScore] = field(default_factory=list)
    meta: Dict[str, Any] = field(default_factory=dict)

    def dim(self, key: str) -> Optional[DimensionScore]:
        """Return the first dimension whose ``key`` equals *key*, or None."""
        matches = (axis for axis in self.dimensions if axis.key == key)
        return next(matches, None)
|
synthkit/privacy/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Product C: privacy-safe synthetic twins of real datasets. [roadmap]
|
| 2 |
+
|
| 3 |
+
Fits a generator to a real dataset, emits a statistically similar synthetic
|
| 4 |
+
copy, and adds privacy/utility dimensions (membership-inference distance,
|
| 5 |
+
column-distribution fidelity) to the shared grading report.
|
| 6 |
+
"""
|
synthkit/providers.py
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Response + embedding providers.
|
| 2 |
+
|
| 3 |
+
A *provider* turns a prompt into a response (for instruction→output pairs); an
|
| 4 |
+
*embedder* turns text into a vector (for the optional semantic-quality axis).
|
| 5 |
+
The 'none' path is stdlib; ollama is local & free; anthropic/openai are lazy
|
| 6 |
+
imports used only if selected.
|
| 7 |
+
"""
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import json
|
| 11 |
+
import os
|
| 12 |
+
import urllib.error
|
| 13 |
+
import urllib.request
|
| 14 |
+
from typing import List, Optional
|
| 15 |
+
|
| 16 |
+
from synthkit.models import SynthkitError
|
| 17 |
+
|
| 18 |
+
# ---- HTTP helper with friendly Ollama errors ---------------------------------
|
| 19 |
+
|
| 20 |
+
def _post_json(url: str, payload: dict, timeout: int = 120) -> dict:
    """POST *payload* as JSON to an Ollama endpoint and return the decoded reply.

    Args:
        url: full endpoint URL.
        payload: JSON-serialisable request body.
        timeout: timeout in seconds passed to ``urlopen``.

    Returns:
        The response body parsed as JSON.

    Raises:
        SynthkitError: on an HTTP error status, an unreachable server, a
            connection that fails or times out mid-response, or a response
            body that is not valid JSON.
    """
    body = json.dumps(payload).encode("utf-8")
    req = urllib.request.Request(url, data=body,
                                 headers={"Content-Type": "application/json"})
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            raw = resp.read()
    except urllib.error.HTTPError as exc:
        # HTTPError subclasses URLError, so it has to be handled first.
        detail = exc.read().decode("utf-8", "ignore")[:200]
        hint = ""
        if exc.code == 404 and "model" in detail.lower():
            hint = ". Pull it first with `ollama pull <model>`"
        raise SynthkitError(
            f"Ollama returned HTTP {exc.code} from {url}{hint}\n {detail}") from exc
    except urllib.error.URLError as exc:
        raise SynthkitError(
            f"can't reach Ollama at {url} ({exc.reason}). "
            "Is the daemon running? Start it with `ollama serve`.") from exc
    except OSError as exc:
        # Read timeouts and connection resets while the body is being
        # received are raised directly, not wrapped in URLError.
        raise SynthkitError(
            f"connection to Ollama at {url} failed mid-response ({exc}). "
            "Is the daemon still running?") from exc

    try:
        return json.loads(raw.decode("utf-8"))
    except ValueError as exc:  # covers JSONDecodeError and UnicodeDecodeError
        raise SynthkitError(
            f"Ollama returned a response from {url} that is not valid JSON") from exc
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# ---- response providers ------------------------------------------------------
|
| 40 |
+
|
| 41 |
+
class Provider:
    """Base class for response providers; subclasses override ``generate``."""

    name = "base"

    def generate(self, prompt: str, system: str = "") -> str:
        """Return the response text for *prompt*; must be overridden."""
        raise NotImplementedError
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
class OllamaProvider(Provider):
    """Local, free responses via a running Ollama daemon."""

    name = "ollama"

    def __init__(self, model: str = "llama3.2", host: str = "") -> None:
        """Remember the model and resolve the daemon's base URL.

        The host comes from *host*, else the OLLAMA_HOST environment variable,
        else http://localhost:11434; trailing slashes are stripped.
        """
        self.model = model
        base = host if host else os.environ.get("OLLAMA_HOST", "http://localhost:11434")
        self.host = base.rstrip("/")

    def generate(self, prompt: str, system: str = "") -> str:
        """Return the stripped, non-streamed completion for *prompt*."""
        request = {
            "model": self.model, "prompt": prompt,
            "system": system, "stream": False,
        }
        reply = _post_json(self.host + "/api/generate", request)
        text = reply.get("response") or ""
        return text.strip()
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class AnthropicProvider(Provider):
    """Responses via the Anthropic SDK, which is imported only when selected."""

    name = "anthropic"

    def __init__(self, model: str = "claude-haiku-4-5-20251001") -> None:
        """Create the SDK client.

        Raises:
            SynthkitError: if the anthropic package is not installed.
        """
        try:
            import anthropic
        except ImportError as exc:
            message = ("--provider anthropic needs the anthropic SDK. "
                       "`pip install anthropic`.")
            raise SynthkitError(message) from exc
        self.model = model
        self._client = anthropic.Anthropic()

    def generate(self, prompt: str, system: str = "") -> str:
        """Return the concatenated text blocks of the reply, stripped."""
        system_prompt = system or "You are a helpful assistant."
        reply = self._client.messages.create(
            model=self.model,
            max_tokens=1024,
            system=system_prompt,
            messages=[{"role": "user", "content": prompt}])
        # Only blocks typed "text" contribute to the answer.
        pieces = [block.text for block in reply.content
                  if getattr(block, "type", "") == "text"]
        return "".join(pieces).strip()
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
class OpenAIProvider(Provider):
    """Responses via the OpenAI SDK, which is imported only when selected."""

    name = "openai"

    def __init__(self, model: str = "gpt-4o-mini") -> None:
        """Create the SDK client.

        Raises:
            SynthkitError: if the openai package is not installed.
        """
        try:
            import openai
        except ImportError as exc:
            message = ("--provider openai needs the openai SDK. "
                       "`pip install openai`.")
            raise SynthkitError(message) from exc
        self.model = model
        self._client = openai.OpenAI()

    def generate(self, prompt: str, system: str = "") -> str:
        """Return the first choice's message content, stripped ("" if missing)."""
        conversation = [
            {"role": "system", "content": system or "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ]
        resp = self._client.chat.completions.create(
            model=self.model, messages=conversation)
        content = resp.choices[0].message.content or ""
        return content.strip()
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def get_provider(name: Optional[str], model: str = "") -> Optional[Provider]:
    """Build the response provider selected by *name*.

    Args:
        name: "ollama", "anthropic" or "openai"; a falsy value or "none"
            selects no provider.
        model: model identifier; each provider's default is used when empty.

    Returns:
        The provider instance, or None when no provider is selected.

    Raises:
        SynthkitError: if *name* is not a known provider.
    """
    if not name or name == "none":
        return None
    choices = (
        ("ollama", OllamaProvider, "llama3.2"),
        ("anthropic", AnthropicProvider, "claude-haiku-4-5-20251001"),
        ("openai", OpenAIProvider, "gpt-4o-mini"),
    )
    for label, factory, default_model in choices:
        if name == label:
            return factory(model or default_model)
    raise SynthkitError(f"unknown provider {name!r}")
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
# ---- embedders (for the optional semantic axis) ------------------------------
|
| 119 |
+
|
| 120 |
+
class Embedder:
    """Base class for embedders; subclasses override ``embed``."""

    name = "base"

    def embed(self, texts: List[str]) -> List[List[float]]:
        """Return one embedding vector per input text; must be overridden."""
        raise NotImplementedError
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
class OllamaEmbedder(Embedder):
    """Embeddings from a running Ollama daemon, one request per text."""

    name = "ollama"

    def __init__(self, model: str = "nomic-embed-text", host: str = "") -> None:
        """Remember the model and resolve the daemon's base URL.

        The host comes from *host*, else the OLLAMA_HOST environment variable,
        else http://localhost:11434; trailing slashes are stripped.
        """
        self.model = model
        base = host if host else os.environ.get("OLLAMA_HOST", "http://localhost:11434")
        self.host = base.rstrip("/")

    def embed(self, texts: List[str]) -> List[List[float]]:
        """Return one vector per text, in input order.

        Raises:
            SynthkitError: if the daemon replies without a (non-empty) vector.
        """
        endpoint = self.host + "/api/embeddings"
        vectors: List[List[float]] = []
        for text in texts:
            reply = _post_json(endpoint, {"model": self.model, "prompt": text})
            vector = reply.get("embedding")
            if not vector:
                raise SynthkitError(f"Ollama embedder returned no vector for model {self.model!r}")
            vectors.append(vector)
        return vectors
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
class OpenAIEmbedder(Embedder):
    """Embeddings via the OpenAI SDK, which is imported only when selected."""

    name = "openai"

    def __init__(self, model: str = "text-embedding-3-small") -> None:
        """Create the SDK client.

        Raises:
            SynthkitError: if the openai package is not installed.
        """
        try:
            import openai
        except ImportError as exc:
            message = ("--embed-provider openai needs the openai SDK. "
                       "`pip install openai`.")
            raise SynthkitError(message) from exc
        self.model = model
        self._client = openai.OpenAI()

    def embed(self, texts: List[str]) -> List[List[float]]:
        """Embed all *texts* in a single request and return their vectors."""
        resp = self._client.embeddings.create(model=self.model, input=texts)
        vectors = []
        for item in resp.data:
            vectors.append(item.embedding)
        return vectors
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
def get_embedder(name: Optional[str], model: str = "") -> Optional[Embedder]:
    """Build the embedder selected by *name*.

    Args:
        name: "ollama" or "openai"; a falsy value or "none" selects no embedder.
        model: model identifier; each embedder's default is used when empty.

    Returns:
        The embedder instance, or None when no embedder is selected.

    Raises:
        SynthkitError: if *name* is not a known embed provider.
    """
    if not name or name == "none":
        return None
    choices = (
        ("ollama", OllamaEmbedder, "nomic-embed-text"),
        ("openai", OpenAIEmbedder, "text-embedding-3-small"),
    )
    for label, factory, default_model in choices:
        if name == label:
            return factory(model or default_model)
    raise SynthkitError(f"unknown embed provider {name!r}")
|
synthkit/report.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Render a GradeReport: terminal, JSON, and a standalone HTML report."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import html
|
| 5 |
+
import json
|
| 6 |
+
from typing import List
|
| 7 |
+
|
| 8 |
+
from synthkit import __version__
|
| 9 |
+
from synthkit.models import GradeReport, to_grade
|
| 10 |
+
|
| 11 |
+
# ---- ANSI helpers ------------------------------------------------------------
|
| 12 |
+
_C = {
|
| 13 |
+
"reset": "\033[0m", "bold": "\033[1m", "dim": "\033[2m",
|
| 14 |
+
"red": "\033[31m", "green": "\033[32m", "yellow": "\033[33m",
|
| 15 |
+
"blue": "\033[34m", "magenta": "\033[35m", "cyan": "\033[36m",
|
| 16 |
+
}
|
| 17 |
+
GRADE_COLOR = {"A+": "green", "A": "green", "B": "cyan", "C": "yellow", "D": "yellow", "F": "red"}
|
| 18 |
+
GRADE_HEX = {"A+": "#16a34a", "A": "#16a34a", "B": "#0891b2", "C": "#ca8a04",
|
| 19 |
+
"D": "#ea580c", "F": "#dc2626"}
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _c(text: str, color: str) -> str:
|
| 23 |
+
return f"{_C.get(color, '')}{text}{_C['reset']}"
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _bar(score: float, width: int = 21) -> str:
|
| 27 |
+
fill = int(round(score / 100 * width))
|
| 28 |
+
return "█" * fill + "░" * (width - fill)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def _score_color(score: float) -> str:
    """Return the terminal colour name for *score*'s letter grade."""
    letter = to_grade(score)
    return GRADE_COLOR.get(letter, "yellow")
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _hex(score: float) -> str:
    """Return the HTML hex colour for *score*'s letter grade."""
    letter = to_grade(score)
    return GRADE_HEX.get(letter, "#ca8a04")
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def print_report(report: GradeReport, dataset: str = "", use_color: bool = True) -> None:
    """Print a GradeReport to stdout as a framed summary plus per-axis bars.

    Args:
        report: the graded result to render.
        dataset: optional dataset label shown in the header.
        use_color: when False, no ANSI colour codes are emitted.
    """
    def paint(text, color):
        if not use_color:
            return text
        return _c(text, color)

    grade = report.grade
    grade_tone = GRADE_COLOR.get(grade, 'yellow')

    # Header box.
    print()
    print(paint(" ┌─ synthkit · data quality ──────────────────────────", "dim"))
    print(" │")
    print(f" │ Quality grade {paint(grade, grade_tone)} ({report.score}/100)")
    print(f" │ Records {report.n_records}")
    if dataset:
        print(f" │ Dataset {dataset}")
    print(" │")
    print(paint(" └────────────────────────────────────────────────────", "dim"))

    # One row per axis; axes without a score are shown as n/a.
    print(f"\n {paint('DIMENSIONS', 'bold')}\n")
    for axis in report.dimensions:
        blurb = paint(axis.summary, 'dim')
        if not axis.applicable:
            print(f" {paint('○', 'dim')} {axis.title:<14}{paint(' n/a', 'dim')} {blurb}")
            continue
        mark = paint("●", _score_color(axis.score))
        bar = paint(_bar(axis.score), _score_color(axis.score))
        print(f" {mark} {axis.title:<14}{axis.score:>5.0f} {bar} {blurb}")

    # Findings from every axis, in axis order.
    notes = []
    for axis in report.dimensions:
        for finding in axis.findings:
            notes.append((axis.title, finding))
    if notes:
        print(f"\n {paint('NOTES', 'bold')}\n")
        for title, finding in notes:
            print(f" {paint('▸ ' + title + ':', 'cyan')} {finding}")

    print(f"\n {paint('Grade = weighted blend of the applicable axes · tune with --against / --field', 'dim')}")
    print()
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def to_json(report: GradeReport, dataset: str = "") -> str:
    """Serialise *report* to an indented JSON string.

    Each dimension contributes its key, title, score, summary, findings and
    stats; the dimension weight is not included.
    """
    dimensions = []
    for axis in report.dimensions:
        dimensions.append({
            "key": axis.key,
            "title": axis.title,
            "score": axis.score,
            "summary": axis.summary,
            "findings": axis.findings,
            "stats": axis.stats,
        })
    payload = {
        "dataset": dataset,
        "grade": report.grade,
        "score": report.score,
        "records": report.n_records,
        "dimensions": dimensions,
        "meta": report.meta,
    }
    return json.dumps(payload, indent=2)
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def to_html(report: GradeReport, dataset: str = "") -> str:
    """Render *report* as a standalone HTML page (inline CSS, no assets).

    Dimension titles, summaries, findings and the dataset label are
    HTML-escaped before being inserted.

    Args:
        report: the graded result to render.
        dataset: optional dataset label shown next to the record count.

    Returns:
        The complete HTML document as a string.
    """
    gcolor = GRADE_HEX.get(report.grade, "#ca8a04")

    def _card(axis) -> str:
        # Axes without a score get a plain "n/a" card with no bar.
        if not axis.applicable:
            return f"""
<div class="dim">
<div class="dim-head">
<span class="dt">{html.escape(axis.title)}</span>
<span class="ds na">n/a</span>
</div>
<div class="dsum">{html.escape(axis.summary)}</div>
</div>"""
        bc = _hex(axis.score)
        return f"""
<div class="dim">
<div class="dim-head">
<span class="dt">{html.escape(axis.title)}</span>
<span class="ds" style="color:{bc}">{axis.score:.0f}</span>
</div>
<div class="track"><div class="fill" style="width:{axis.score:.0f}%;background:{bc}"></div></div>
<div class="dsum">{html.escape(axis.summary)}</div>
</div>"""

    rows: List[str] = [_card(axis) for axis in report.dimensions]
    cards_html = ''.join(rows)

    notes: List[str] = []
    for axis in report.dimensions:
        for finding in axis.findings:
            notes.append(f'<li><b>{html.escape(axis.title)}:</b> {html.escape(finding)}</li>')
    if notes:
        notes_html = '<ul class="notes">' + ''.join(notes) + '</ul>'
    else:
        notes_html = '<p style="color:#86efac">Nothing flagged.</p>'

    dataset_suffix = (' · ' + html.escape(dataset)) if dataset else ''

    return f"""<!doctype html><html lang="en"><head><meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>synthkit report</title>
<style>
:root {{ color-scheme: light dark; }}
body {{ font: 15px/1.55 -apple-system, Segoe UI, Roboto, sans-serif; margin: 0;
background: #0b1020; color: #e5e7eb; }}
.wrap {{ max-width: 760px; margin: 0 auto; padding: 40px 24px 80px; }}
h1 {{ font-size: 20px; letter-spacing: .3px; margin: 0 0 4px; }}
.sub {{ color: #94a3b8; margin: 0 0 28px; font-size: 13px; }}
.hero {{ display: flex; gap: 24px; align-items: center; background: #111934;
border: 1px solid #1e293b; border-radius: 14px; padding: 24px; margin-bottom: 28px; }}
.grade {{ font-size: 56px; font-weight: 800; line-height: 1; color: {gcolor}; }}
.meta {{ flex: 1; }}
.meta .big {{ font-size: 15px; margin-bottom: 6px; }}
.meta .small {{ color: #94a3b8; font-size: 13px; }}
h2 {{ font-size: 13px; text-transform: uppercase; letter-spacing: 1px; color: #94a3b8;
margin: 32px 0 14px; }}
.dim {{ background: #111934; border: 1px solid #1e293b; border-radius: 12px;
padding: 14px 18px; margin-bottom: 10px; }}
.dim-head {{ display: flex; justify-content: space-between; align-items: baseline; }}
.dt {{ font-weight: 700; }}
.ds {{ font-size: 22px; font-weight: 800; }}
.ds.na {{ color: #64748b; font-size: 15px; font-weight: 600; }}
.track {{ height: 8px; background: #0b1020; border-radius: 999px; margin: 10px 0 8px; overflow: hidden; }}
.fill {{ height: 100%; border-radius: 999px; }}
.dsum {{ color: #cbd5e1; font-size: 13px; }}
ul.notes {{ list-style: none; padding: 0; margin: 0; }}
ul.notes li {{ background: #0e1830; border: 1px solid #1e293b; border-left: 3px solid #38bdf8;
border-radius: 8px; padding: 9px 14px; margin-bottom: 8px; font-size: 13.5px; }}
ul.notes b {{ color: #818cf8; }}
.foot {{ margin-top: 36px; color: #64748b; font-size: 12px; }}
</style></head><body><div class="wrap">
<h1>synthkit: synthetic data quality report</h1>
<p class="sub">validity · uniqueness · diversity · contamination</p>
<div class="hero">
<div class="grade">{report.grade}</div>
<div class="meta">
<div class="big">Quality score <b>{report.score}/100</b></div>
<div class="small">{report.n_records} records{dataset_suffix}</div>
</div>
</div>
<h2>Dimensions</h2>
{cards_html}
<h2>Notes</h2>
{notes_html}
<p class="foot">Generated by synthkit v{__version__} · grade is a weighted blend of the applicable axes.</p>
</div></body></html>"""
|
synthkit/tabular/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Product B: schema-aware tabular fixtures with referential integrity. [roadmap]
|
| 2 |
+
|
| 3 |
+
Slots into the same core as Product A: it will emit records that the shared
|
| 4 |
+
grading engine (validity / uniqueness / diversity, plus tabular-specific
|
| 5 |
+
referential-integrity and PII-safety checks) scores with the same report.
|
| 6 |
+
"""
|
synthkit/text/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Product A: synthetic instruction & evaluation datasets for LLMs."""
|
synthkit/text/generate.py
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Generate text records (eval prompts or instruction→output pairs) from a seed spec.
|
| 2 |
+
|
| 3 |
+
Two phases:
|
| 4 |
+
1. sample prompts: deterministic for a fixed seed; exact-duplicate prompts are
|
| 5 |
+
skipped by default so naive slot collisions don't pad the dataset.
|
| 6 |
+
2. fill responses: only for instruction data with response.mode == 'provider';
|
| 7 |
+
runs concurrently with a progress callback.
|
| 8 |
+
|
| 9 |
+
Optional: pass a `dedup_embedder` to dedup *by meaning as you generate*: each
|
| 10 |
+
candidate is embedded and rejected if it's within `dedup_threshold` cosine of an
|
| 11 |
+
already-accepted record.
|
| 12 |
+
|
| 13 |
+
Templates are rendered with a safe `{slot}`-only substitution (NOT str.format):
|
| 14 |
+
attribute/index access and format specs are treated as literal text, so an
|
| 15 |
+
untrusted template can't reach object internals or trigger a format-spec blow-up.
|
| 16 |
+
"""
|
| 17 |
+
from __future__ import annotations
|
| 18 |
+
|
| 19 |
+
import random
|
| 20 |
+
import re
|
| 21 |
+
from typing import Any, Callable, Dict, List, Optional, Tuple
|
| 22 |
+
|
| 23 |
+
from synthkit.grading import record_text
|
| 24 |
+
from synthkit.models import SynthkitError
|
| 25 |
+
from synthkit.providers import Embedder, Provider
|
| 26 |
+
from synthkit.util import max_cosine, pmap, unit
|
| 27 |
+
|
| 28 |
+
# Matches a bare ``{name}`` placeholder only: word characters between braces.
_PLACEHOLDER = re.compile(r"\{(\w+)\}")
_MAX_RECORD_CHARS = 100_000  # guards against a pathological slot value


def render(template: str, fill: Dict[str, str]) -> str:
    """Fill bare ``{slot}`` placeholders in *template* with values from *fill*.

    Only text of the exact form ``{word}`` is replaced. Attribute or index
    access, conversions and format specs do not match the pattern and are
    left in the output untouched. Values are converted with ``str()``.

    Raises:
        SynthkitError: if a placeholder names a key that is not in *fill*,
            or if the rendered text is longer than ``_MAX_RECORD_CHARS``.
    """
    def _substitute(found: "re.Match[str]") -> str:
        slot = found.group(1)
        if slot in fill:
            return str(fill[slot])
        raise SynthkitError(f"template slot '{slot}' is missing from 'slots'")

    rendered = _PLACEHOLDER.sub(_substitute, template)
    if len(rendered) <= _MAX_RECORD_CHARS:
        return rendered
    raise SynthkitError(f"rendered record exceeds {_MAX_RECORD_CHARS} characters")
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def _rule_response(resp_cfg: Dict[str, Any], fill: Dict[str, str]) -> str:
    """Render the rule-mode response template with the prompt's slot fill.

    A missing ``"template"`` key is treated as an empty template; the value
    is coerced to ``str`` before rendering.
    """
    response_template = resp_cfg.get("template", "")
    return render(str(response_template), fill)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def _shape_record(kind: str, prompt: str, response: str,
                  system: str, domain: str) -> Dict[str, Any]:
    """Build the output record for one prompt.

    For ``kind == "instruction"`` the record has ``instruction``, an empty
    ``input`` and ``output``, plus ``system`` when it is non-empty. Any other
    kind produces a prompt-only record. ``domain`` is added last when it is
    non-empty.
    """
    if kind != "instruction":
        record: Dict[str, Any] = {"prompt": prompt}
    else:
        record = {"instruction": prompt, "input": "", "output": response}
        if system:
            record["system"] = system

    if domain:
        record["domain"] = domain
    return record
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def _response_for(kind: str, mode: str, resp_cfg: Dict[str, Any], fill: Dict[str, str],
                  prompt: str, system: str, provider: Optional[Provider]) -> str:
    """Produce the response text for a single prompt.

    Returns an empty string for non-instruction kinds and for mode ``"none"``.
    Mode ``"rule"`` renders the response template; mode ``"provider"`` calls
    ``provider.generate(prompt, system)``.

    Raises:
        SynthkitError: for an instruction record with an unrecognised mode.
    """
    if kind != "instruction" or mode == "none":
        return ""
    if mode == "rule":
        return _rule_response(resp_cfg, fill)
    if mode != "provider":
        raise SynthkitError(f"unknown response.mode {mode!r}")
    # This helper does not check that a provider was supplied.
    return provider.generate(prompt, system)  # type: ignore[union-attr]
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def sample_prompts(spec: Dict[str, Any], n: int, *, seed: int = 17,
                   dedup: bool = True,
                   max_attempts: Optional[int] = None
                   ) -> List[Tuple[str, Dict[str, str]]]:
    """Phase 1: return up to n (prompt, slot-fill) pairs.

    Each attempt picks one template and one value for every slot from a
    ``random.Random(seed)`` stream, so the output is reproducible for a fixed
    seed and spec. Fewer than *n* pairs are returned when the attempt cap is
    reached first.

    Args:
        spec: seed spec; reads ``templates``, ``slots`` and
            ``constraints.min_words``.
        n: number of pairs wanted.
        seed: seed for the private random generator.
        dedup: when true, skip prompts whose lowercased, whitespace-collapsed
            text has already been accepted.
        max_attempts: cap on sampling attempts; defaults to
            ``max(n * 50, 200)``.

    Raises:
        SynthkitError: if the spec has no templates, a slot has no values,
            or rendering a prompt fails.
    """
    templates = spec.get("templates") or []
    if not templates:
        raise SynthkitError("seed spec has no 'templates'")
    slots: Dict[str, List[str]] = spec.get("slots") or {}
    # random.choice raises a bare IndexError on an empty sequence; report the
    # offending slot as a spec error before any sampling happens.
    for slot_name, options in slots.items():
        if not options:
            raise SynthkitError(f"slot '{slot_name}' has no values to choose from")
    min_words = int((spec.get("constraints") or {}).get("min_words", 0))

    rng = random.Random(seed)
    out: List[Tuple[str, Dict[str, str]]] = []
    seen: set = set()
    attempts = 0
    cap = max_attempts if max_attempts is not None else max(n * 50, 200)
    while len(out) < n and attempts < cap:
        attempts += 1
        # Draw order (template, then every slot in dict order) determines the
        # random stream, so it must stay fixed for reproducibility.
        template = rng.choice(templates)
        fill = {k: rng.choice(v) for k, v in slots.items()}
        prompt = render(template, fill)
        if len(prompt.split()) < min_words:
            continue
        if dedup:
            # Case- and whitespace-insensitive identity of the prompt.
            key = " ".join(prompt.lower().split())
            if key in seen:
                continue
            seen.add(key)
        out.append((prompt, fill))
    return out
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def _generate_dedup(spec, n, provider, seed, dedup, progress,
                    embedder: Embedder, threshold: float,
                    stats: Optional[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Generate records one at a time, rejecting near-duplicates by embedding.

    An oversampled pool of ``max(n * 5, n + 100)`` prompts is drawn, then each
    candidate is turned into a full record (response included), embedded, and
    kept only if its cosine similarity to every accepted record is below
    *threshold*. ``progress(accepted, n)`` is called after each acceptance.

    When *stats* is a dict it receives ``rejected_semantic``, ``attempts`` and
    ``pool`` counters.
    """
    kind = spec.get("kind", "eval")
    system = spec.get("system", "")
    domain = spec.get("domain", "")
    resp_cfg = spec.get("response") or {"mode": "none"}
    mode = resp_cfg.get("mode", "none")

    candidates = sample_prompts(spec, max(n * 5, n + 100), seed=seed, dedup=dedup)
    accepted: List[Dict[str, Any]] = []
    accepted_units: List[List[float]] = []
    tried = 0
    dropped = 0
    for prompt, fill in candidates:
        if len(accepted) >= n:
            break
        tried += 1
        response = _response_for(kind, mode, resp_cfg, fill, prompt, system, provider)
        record = _shape_record(kind, prompt, response, system, domain)
        vector = unit(embedder.embed([record_text(record, None)])[0])
        # The first candidate has nothing to be compared against.
        if accepted_units and max_cosine(vector, accepted_units) >= threshold:
            dropped += 1
            continue
        accepted.append(record)
        accepted_units.append(vector)
        if progress:
            progress(len(accepted), n)

    if stats is not None:
        stats["rejected_semantic"] = dropped
        stats["attempts"] = tried
        stats["pool"] = len(candidates)
    return accepted
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def generate(spec: Dict[str, Any], n: int, *, provider: Optional[Provider] = None,
             seed: int = 17, dedup: bool = True,
             max_attempts: Optional[int] = None, concurrency: int = 1,
             progress: Optional[Callable[[int, int], None]] = None,
             dedup_embedder: Optional[Embedder] = None,
             dedup_threshold: float = 0.9,
             stats: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
    """Generate up to *n* records from a seed spec.

    With a ``dedup_embedder`` the work is delegated to the sequential
    semantic-dedup path, which receives *stats* but not ``max_attempts`` or
    ``concurrency``. Otherwise prompts are sampled first and responses are
    filled afterwards: blank for eval data or mode ``"none"``, rendered from
    the response template for mode ``"rule"``, or fetched from the provider
    (concurrently, reporting *progress*) for mode ``"provider"``.

    Raises:
        SynthkitError: if mode is ``"provider"`` without a provider, or an
            instruction spec names an unknown response mode.
    """
    kind = spec.get("kind", "eval")
    system = spec.get("system", "")
    domain = spec.get("domain", "")
    resp_cfg = spec.get("response") or {"mode": "none"}
    mode = resp_cfg.get("mode", "none")

    if provider is None and mode == "provider":
        raise SynthkitError("response.mode is 'provider' but no provider was selected "
                            "(pass --provider ollama|anthropic|openai)")

    if dedup_embedder is not None:
        return _generate_dedup(spec, n, provider, seed, dedup, progress,
                               dedup_embedder, dedup_threshold, stats)

    pairs = sample_prompts(spec, n, seed=seed, dedup=dedup, max_attempts=max_attempts)

    if kind != "instruction" or mode == "none":
        outputs: List[str] = [""] * len(pairs)
    elif mode == "rule":
        outputs = [_rule_response(resp_cfg, fill) for _, fill in pairs]
    elif mode == "provider":
        def _ask(pair: Tuple[str, Dict[str, str]]) -> str:
            return provider.generate(pair[0], system)

        outputs = pmap(_ask, pairs, concurrency=concurrency, progress=progress)
    else:
        raise SynthkitError(f"unknown response.mode {mode!r}")

    records: List[Dict[str, Any]] = []
    for (prompt, _fill), output in zip(pairs, outputs):
        records.append(_shape_record(kind, prompt, output, system, domain))
    return records
|
synthkit/text/seeds.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Built-in seed specs used by `synthkit text gen --demo`.
|
| 2 |
+
|
| 3 |
+
A seed spec is plain data (works as JSON or YAML on disk too):
|
| 4 |
+
|
| 5 |
+
kind "eval" (prompt-only) or "instruction" (instruction→output)
|
| 6 |
+
templates sentence templates with {slot} placeholders
|
| 7 |
+
slots lists of fillers, one per placeholder name
|
| 8 |
+
response how to fill the output for instruction data:
|
| 9 |
+
{"mode": "none"} leave blank (eval sets)
|
| 10 |
+
{"mode": "rule", "template": ...} fill a string template (offline)
|
| 11 |
+
{"mode": "provider"} call an LLM provider
|
| 12 |
+
constraints optional, e.g. {"min_words": 4}
|
| 13 |
+
"""
|
| 14 |
+
from __future__ import annotations
|
| 15 |
+
|
| 16 |
+
# Slot values shared by both demo specs.
_LANGS = [
    "Python",
    "JavaScript",
    "Rust",
    "Go",
    "TypeScript",
    "Java",
    "C++",
    "Ruby",
]
_TASKS = [
    "reverse a string",
    "check whether a number is prime",
    "merge two sorted lists",
    "find the longest common subsequence of two strings",
    "parse an ISO-8601 date",
    "debounce a function",
    "flatten a deeply nested list",
    "compute a moving average over a stream",
    "detect a cycle in a linked list",
    "implement binary search",
]

# Prompt-only evaluation set: no responses are generated.
DEMO_EVAL = {
    "kind": "eval",
    "domain": "coding",
    "templates": [
        "Write a {language} function that {task}.",
        "How would you {task} in {language}? Walk through your reasoning.",
        "Review this {language} snippet that is meant to {task} and point out the bugs.",
        "Explain to a beginner how to {task} using {language}.",
        "What's the most efficient way to {task} in {language}, and why?",
        "Refactor a {language} program that {task} to be more readable.",
    ],
    "slots": {"language": _LANGS, "task": _TASKS},
    "constraints": {"min_words": 4},
    "response": {"mode": "none"},
}

# Instruction set whose outputs come from an offline string template.
DEMO_INSTRUCTION = {
    "kind": "instruction",
    "domain": "coding",
    "system": "You are a precise, helpful coding assistant.",
    "templates": [
        "Write a {language} function that {task}.",
        "Show me how to {task} in {language}.",
        "I need {language} code to {task}. Include a short explanation.",
    ],
    "slots": {"language": _LANGS, "task": _TASKS},
    "constraints": {"min_words": 4},
    "response": {
        "mode": "rule",
        "template": (
            "Here's an approach in {language} to {task}: "
            "start by clarifying the inputs and edge cases, "
            "then implement the core logic and test it."
        ),
    },
}

# Lookup of the built-in specs by their "kind".
BUILTIN_SEEDS = {"eval": DEMO_EVAL, "instruction": DEMO_INSTRUCTION}
|
synthkit/util.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Small shared utilities."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import math
|
| 5 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 6 |
+
from typing import Callable, Iterable, List, Optional, Sequence, TypeVar
|
| 7 |
+
|
| 8 |
+
T = TypeVar("T")
R = TypeVar("R")


def pmap(fn: Callable[[T], R], items: Iterable[T], concurrency: int = 1,
         progress: Optional[Callable[[int, int], None]] = None) -> List[R]:
    """Map fn over items, optionally across threads, preserving input order.

    Exceptions propagate (the first one raised wins). In the threaded path,
    items that have not started yet are cancelled when that happens, so a
    failing ``fn`` is not invoked for the rest of the queue; items already
    running are allowed to finish. `progress(done, total)` is called after
    each item completes.

    Args:
        fn: callable applied to each item.
        items: input iterable; it is materialised into a list first.
        concurrency: worker thread count; values <= 1 run sequentially in
            the calling thread.
        progress: optional callback, invoked from the calling thread.

    Returns:
        Results in the same order as *items*.
    """
    items = list(items)
    total = len(items)
    results: List[Optional[R]] = [None] * total
    if concurrency <= 1:
        for i, it in enumerate(items):
            results[i] = fn(it)
            if progress:
                progress(i + 1, total)
        return results  # type: ignore[return-value]

    done = 0
    with ThreadPoolExecutor(max_workers=concurrency) as ex:
        futs = {ex.submit(fn, it): i for i, it in enumerate(items)}
        try:
            for fut in as_completed(futs):
                results[futs[fut]] = fut.result()
                done += 1
                if progress:
                    progress(done, total)
        except BaseException:
            # Leaving the ``with`` block waits for every queued future, so
            # drop the ones that have not started before re-raising.
            # cancel() is a no-op on running or finished futures.
            for pending in futs:
                pending.cancel()
            raise
    return results  # type: ignore[return-value]
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def unit(vec: Sequence[float]) -> List[float]:
    """Scale *vec* to Euclidean length 1.

    A vector whose length is zero is divided by 1.0 instead, so it comes back
    with its components unchanged.
    """
    length = math.sqrt(sum(component * component for component in vec))
    if not length:
        length = 1.0
    return [component / length for component in vec]
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def max_cosine(u: Sequence[float], units: Sequence[Sequence[float]]) -> float:
    """Return the largest dot product between *u* and any vector in *units*.

    The inputs are expected to be unit vectors, in which case the dot product
    is the cosine similarity. The running maximum starts at 0.0, so the result
    is never negative and is 0.0 when *units* is empty.
    """
    highest = 0.0
    for candidate in units:
        dot = 0.0
        for left, right in zip(u, candidate):
            dot += left * right
        highest = dot if dot > highest else highest
    return highest
|