synthkit Space v0.3.0 — grade + offline generate demo
Browse files- README.md +17 -7
- app.py +125 -0
- requirements.txt +1 -0
README.md
CHANGED
|
@@ -1,13 +1,23 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
-
python_version: '3.13'
|
| 9 |
app_file: app.py
|
| 10 |
pinned: false
|
|
|
|
| 11 |
---
|
| 12 |
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: synthkit
|
| 3 |
+
emoji: 🧪
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: blue
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 4.44.0
|
|
|
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
+
license: mit
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# synthkit — synthetic data, graded
|
| 14 |
+
|
| 15 |
+
Generate synthetic LLM data and **grade it** on validity, uniqueness, diversity,
|
| 16 |
+
and contamination, with an A+→F headline. This Space is a live, offline demo
|
| 17 |
+
(template generation + lexical grading).
|
| 18 |
+
|
| 19 |
+
The full command-line tool adds LLM-backed instruction→output generation,
|
| 20 |
+
fine-tuning output formats (alpaca / sharegpt / openai), and an embedding-based
|
| 21 |
+
semantic-dedup axis that catches paraphrase duplicates lexical methods miss.
|
| 22 |
+
|
| 23 |
+
👉 **Source & docs:** https://github.com/LaelaZorana/synthkit
|
app.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""synthkit — Hugging Face Space (Gradio).
|
| 2 |
+
|
| 3 |
+
A small live demo of the synthkit quality grader and offline generator. This
|
| 4 |
+
Space runs fully offline: template-based generation (no API key) and the lexical
|
| 5 |
+
grading axes. The CLI adds LLM-backed generation, fine-tuning output formats, and
|
| 6 |
+
an embedding-based semantic-dedup axis. Source: https://github.com/LaelaZorana/synthkit
|
| 7 |
+
"""
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import html
|
| 11 |
+
import json
|
| 12 |
+
|
| 13 |
+
import gradio as gr
|
| 14 |
+
|
| 15 |
+
from synthkit import __version__
|
| 16 |
+
from synthkit.grading import grade_dataset
|
| 17 |
+
from synthkit.report import to_html, to_json
|
| 18 |
+
from synthkit.text.generate import generate
|
| 19 |
+
from synthkit.text.seeds import BUILTIN_SEEDS
|
| 20 |
+
|
| 21 |
+
# Project repository URL; interpolated into the header and footer Markdown links.
GH = "https://github.com/LaelaZorana/synthkit"
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def _records_from(text: str, file) -> list:
|
| 25 |
+
raw = ""
|
| 26 |
+
if file is not None:
|
| 27 |
+
path = file if isinstance(file, str) else getattr(file, "name", None)
|
| 28 |
+
with open(path, "r", encoding="utf-8") as fh:
|
| 29 |
+
raw = fh.read()
|
| 30 |
+
elif text and text.strip():
|
| 31 |
+
raw = text
|
| 32 |
+
raw = raw.strip()
|
| 33 |
+
if not raw:
|
| 34 |
+
raise ValueError("Paste some JSONL/JSON or upload a file first.")
|
| 35 |
+
if raw[0] == "[":
|
| 36 |
+
recs = json.loads(raw)
|
| 37 |
+
else:
|
| 38 |
+
recs = [json.loads(line) for line in raw.splitlines() if line.strip()]
|
| 39 |
+
if not isinstance(recs, list) or not recs:
|
| 40 |
+
raise ValueError("No records found.")
|
| 41 |
+
return recs
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def _iframe(report_html: str) -> str:
|
| 45 |
+
return (f'<iframe srcdoc="{html.escape(report_html, quote=True)}" '
|
| 46 |
+
'style="width:100%;height:760px;border:0;border-radius:14px;background:#0b1020">'
|
| 47 |
+
'</iframe>')
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def _err(exc: Exception) -> str:
|
| 51 |
+
return f'<p style="color:#dc2626;font:14px sans-serif">⚠ {html.escape(str(exc))}</p>'
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def grade_ui(dataset_text, dataset_file, eval_text):
    """Grade a pasted or uploaded dataset for the "Grade a dataset" tab.

    Args:
        dataset_text: Pasted dataset records (JSON array or JSONL).
        dataset_file: Uploaded dataset file, or ``None``; takes precedence
            over ``dataset_text``.
        eval_text: Optional held-out eval set, passed to ``grade_dataset`` as
            ``against``. Accepts the same formats as the dataset.

    Returns:
        A ``(html, json)`` pair: the report wrapped in an iframe plus the JSON
        report, or an error paragraph and an empty string if anything fails.
    """
    try:
        records = _records_from(dataset_text, dataset_file)
        against = None
        if eval_text and eval_text.strip():
            # Parse the eval set with the same loader as the dataset so a
            # pasted JSON array is expanded into records instead of being
            # treated as one nested list "record".
            against = _records_from(eval_text, None)
        report = grade_dataset(records, against=against)
        return _iframe(to_html(report, "uploaded dataset")), to_json(report, "uploaded dataset")
    except Exception as exc:  # noqa: BLE001 - surface any parse/grade error to the UI
        return _err(exc), ""
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def generate_ui(seed_choice, seed_text, n):
    """Generate records from a seed spec and grade them for the UI.

    A non-blank ``seed_text`` is parsed as a JSON seed spec; otherwise the
    built-in seed named by ``seed_choice`` is used. ``n`` is converted to an
    int and passed to ``generate`` together with the fixed ``seed=17``.

    Returns:
        A ``(html, jsonl)`` pair: the graded report wrapped in an iframe plus
        the generated records as JSONL, or an error paragraph and an empty
        string if anything fails.
    """
    try:
        if seed_text and seed_text.strip():
            spec = json.loads(seed_text)
        else:
            spec = BUILTIN_SEEDS[seed_choice]
        records = generate(spec, int(n), seed=17)
        graded = grade_dataset(records)
        lines = [json.dumps(rec, ensure_ascii=False) for rec in records]
        jsonl = "\n".join(lines)
        return _iframe(to_html(graded, "generated dataset")), jsonl
    except Exception as exc:  # noqa: BLE001
        return _err(exc), ""
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
# Top-level Gradio layout: an intro banner, one tab per workflow, and a footer.
with gr.Blocks(
    title="synthkit",
    theme=gr.themes.Soft(),
    css="footer{visibility:hidden}",
) as demo:
    gr.Markdown(
        "# 🧪 synthkit\n"
        "**Generate synthetic data — and grade it before you train on it.** "
        "Scored on validity · uniqueness · diversity · contamination, with an A+→F headline.\n\n"
        "This Space runs offline (template generation + lexical grading). The full "
        f"[CLI]({GH}) adds LLM-backed instruction data, fine-tuning formats "
        f"(alpaca/sharegpt/openai), and an embedding semantic-dedup axis. · v{__version__}"
    )

    # Tab 1: grade an uploaded or pasted dataset.
    with gr.Tab("Grade a dataset"):
        gr.Markdown(
            "Upload or paste a `.jsonl` (one JSON record per line). Optionally "
            "paste a held-out eval set to check **contamination** (train/eval leakage)."
        )
        with gr.Row():
            # Left column: inputs and the action button.
            with gr.Column():
                g_file = gr.File(
                    label="Dataset (.jsonl / .json)",
                    file_types=[".jsonl", ".json", ".txt"],
                )
                g_text = gr.Textbox(
                    label="…or paste records",
                    lines=8,
                    placeholder='{"prompt": "Explain gradient descent"}\n'
                                '{"prompt": "What is a hash map?"}',
                )
                g_eval = gr.Textbox(
                    label="Held-out eval set for contamination (optional)",
                    lines=3,
                    placeholder='{"prompt": "Explain gradient descent"}',
                )
                g_btn = gr.Button("Grade", variant="primary")
            # Right column: rendered report and its JSON form.
            with gr.Column():
                g_html = gr.HTML()
                g_json = gr.Code(label="report.json", language="json")
        g_btn.click(
            grade_ui,
            inputs=[g_text, g_file, g_eval],
            outputs=[g_html, g_json],
        )

    # Tab 2: generate from a seed spec, then grade the result.
    with gr.Tab("Generate (offline)"):
        gr.Markdown(
            "Generate eval prompts from a built-in seed, or paste your own seed "
            "spec (templates + slots). The result is graded automatically."
        )
        with gr.Row():
            # Left column: seed selection, record count, optional custom spec.
            with gr.Column():
                s_choice = gr.Dropdown(list(BUILTIN_SEEDS), value="eval", label="Built-in seed")
                s_n = gr.Slider(10, 300, value=120, step=10, label="How many records")
                s_text = gr.Textbox(
                    label="…or paste a seed spec (JSON)",
                    lines=8,
                    placeholder='{"kind":"eval","templates":["{a} about {b}"],'
                                '"slots":{"a":["Explain","Summarize"],"b":["TLS","DNS","OAuth"]},'
                                '"response":{"mode":"none"}}',
                )
                s_btn = gr.Button("Generate + grade", variant="primary")
            # Right column: graded report and the generated JSONL.
            with gr.Column():
                s_html = gr.HTML()
                s_jsonl = gr.Code(label="dataset.jsonl")
        s_btn.click(
            generate_ui,
            inputs=[s_choice, s_text, s_n],
            outputs=[s_html, s_jsonl],
        )

    gr.Markdown(f"[⭐ Source on GitHub]({GH}) · MIT-licensed · zero-dependency core")


# Launch the app only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()
|
requirements.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
git+https://github.com/LaelaZorana/synthkit.git
|