LaelaZ committed on
Commit
1b1d946
·
verified ·
1 Parent(s): 23dc4b1

synthkit Space v0.3.0 — grade + offline generate demo

Browse files
Files changed (3) hide show
  1. README.md +17 -7
  2. app.py +125 -0
  3. requirements.txt +1 -0
README.md CHANGED
@@ -1,13 +1,23 @@
1
  ---
2
- title: Synthkit
3
- emoji: 🦀
4
- colorFrom: purple
5
- colorTo: yellow
6
  sdk: gradio
7
- sdk_version: 6.16.0
8
- python_version: '3.13'
9
  app_file: app.py
10
  pinned: false
 
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: synthkit
3
+ emoji: 🧪
4
+ colorFrom: indigo
5
+ colorTo: blue
6
  sdk: gradio
7
+ sdk_version: 4.44.0
 
8
  app_file: app.py
9
  pinned: false
10
+ license: mit
11
  ---
12
 
13
+ # synthkit — synthetic data, graded
14
+
15
+ Generate synthetic LLM data and **grade it** on validity, uniqueness, diversity,
16
+ and contamination, with an A+→F headline. This Space is a live, offline demo
17
+ (template generation + lexical grading).
18
+
19
+ The full command-line tool adds LLM-backed instruction→output generation,
20
+ fine-tuning output formats (alpaca / sharegpt / openai), and an embedding-based
21
+ semantic-dedup axis that catches paraphrase duplicates lexical methods miss.
22
+
23
+ 👉 **Source & docs:** https://github.com/LaelaZorana/synthkit
app.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """synthkit — Hugging Face Space (Gradio).
2
+
3
+ A small live demo of the synthkit quality grader and offline generator. This
4
+ Space runs fully offline: template-based generation (no API key) and the lexical
5
+ grading axes. The CLI adds LLM-backed generation, fine-tuning output formats, and
6
+ an embedding-based semantic-dedup axis. Source: https://github.com/LaelaZorana/synthkit
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import html
11
+ import json
12
+
13
+ import gradio as gr
14
+
15
+ from synthkit import __version__
16
+ from synthkit.grading import grade_dataset
17
+ from synthkit.report import to_html, to_json
18
+ from synthkit.text.generate import generate
19
+ from synthkit.text.seeds import BUILTIN_SEEDS
20
+
21
+ GH = "https://github.com/LaelaZorana/synthkit"
22
+
23
+
24
+ def _records_from(text: str, file) -> list:
25
+ raw = ""
26
+ if file is not None:
27
+ path = file if isinstance(file, str) else getattr(file, "name", None)
28
+ with open(path, "r", encoding="utf-8") as fh:
29
+ raw = fh.read()
30
+ elif text and text.strip():
31
+ raw = text
32
+ raw = raw.strip()
33
+ if not raw:
34
+ raise ValueError("Paste some JSONL/JSON or upload a file first.")
35
+ if raw[0] == "[":
36
+ recs = json.loads(raw)
37
+ else:
38
+ recs = [json.loads(line) for line in raw.splitlines() if line.strip()]
39
+ if not isinstance(recs, list) or not recs:
40
+ raise ValueError("No records found.")
41
+ return recs
42
+
43
+
44
+ def _iframe(report_html: str) -> str:
45
+ return (f'<iframe srcdoc="{html.escape(report_html, quote=True)}" '
46
+ 'style="width:100%;height:760px;border:0;border-radius:14px;background:#0b1020">'
47
+ '</iframe>')
48
+
49
+
50
+ def _err(exc: Exception) -> str:
51
+ return f'<p style="color:#dc2626;font:14px sans-serif">⚠ {html.escape(str(exc))}</p>'
52
+
53
+
54
def grade_ui(dataset_text, dataset_file, eval_text):
    """Gradio callback: grade an uploaded or pasted dataset.

    Args:
        dataset_text: Pasted dataset (JSONL or a JSON array); may be empty.
        dataset_file: Uploaded dataset file, or ``None``. Takes precedence
            over ``dataset_text``.
        eval_text: Optional pasted held-out eval set. When non-blank it is
            parsed and passed to ``grade_dataset`` as ``against``.

    Returns:
        A ``(html, json_text)`` pair for the HTML and Code outputs. On any
        failure the first element is an error paragraph and the second is
        an empty string.
    """
    try:
        records = _records_from(dataset_text, dataset_file)
        against = None
        if eval_text and eval_text.strip():
            # Reuse the dataset parser so the eval set accepts the same
            # formats (JSONL or a JSON array) and reports errors the same way.
            against = _records_from(eval_text, None)
        report = grade_dataset(records, against=against)
        return (
            _iframe(to_html(report, "uploaded dataset")),
            to_json(report, "uploaded dataset"),
        )
    except Exception as exc:  # noqa: BLE001 - surface any parse/grade error to the UI
        return _err(exc), ""
64
+
65
+
66
def generate_ui(seed_choice, seed_text, n):
    """Gradio callback: generate records from a seed spec and grade them.

    A non-blank ``seed_text`` is parsed as a JSON seed spec; otherwise the
    built-in seed named by ``seed_choice`` is used. Generation always uses
    ``seed=17``.

    Returns:
        A ``(html, jsonl_text)`` pair, or an error paragraph and an empty
        string if anything fails.
    """
    try:
        if seed_text and seed_text.strip():
            spec = json.loads(seed_text)
        else:
            spec = BUILTIN_SEEDS[seed_choice]
        rows = generate(spec, int(n), seed=17)
        graded = grade_dataset(rows)
        serialized = [json.dumps(row, ensure_ascii=False) for row in rows]
        report_frame = _iframe(to_html(graded, "generated dataset"))
        return report_frame, "\n".join(serialized)
    except Exception as exc:  # noqa: BLE001
        return _err(exc), ""
75
+
76
+
77
# ---------------------------------------------------------------------------
# UI layout: a header, a "grade" tab, a "generate" tab, and a footer link.
# ---------------------------------------------------------------------------
with gr.Blocks(
    title="synthkit",
    theme=gr.themes.Soft(),
    css="footer{visibility:hidden}",
) as demo:
    # Header / pitch.
    gr.Markdown(
        value=(
            "# 🧪 synthkit\n"
            "**Generate synthetic data — and grade it before you train on it.** "
            "Scored on validity · uniqueness · diversity · contamination, with an A+→F headline.\n\n"
            "This Space runs offline (template generation + lexical grading). The full "
            f"[CLI]({GH}) adds LLM-backed instruction data, fine-tuning formats "
            f"(alpaca/sharegpt/openai), and an embedding semantic-dedup axis. · v{__version__}"
        )
    )

    # Tab 1: grade an existing dataset (inputs on the left, report on the right).
    with gr.Tab(label="Grade a dataset"):
        gr.Markdown(
            value=(
                "Upload or paste a `.jsonl` (one JSON record per line). Optionally "
                "paste a held-out eval set to check **contamination** (train/eval leakage)."
            )
        )
        with gr.Row():
            with gr.Column():
                g_file = gr.File(
                    label="Dataset (.jsonl / .json)",
                    file_types=[".jsonl", ".json", ".txt"],
                )
                g_text = gr.Textbox(
                    label="…or paste records",
                    lines=8,
                    placeholder=(
                        '{"prompt": "Explain gradient descent"}\n'
                        '{"prompt": "What is a hash map?"}'
                    ),
                )
                g_eval = gr.Textbox(
                    label="Held-out eval set for contamination (optional)",
                    lines=3,
                    placeholder='{"prompt": "Explain gradient descent"}',
                )
                g_btn = gr.Button(value="Grade", variant="primary")
            with gr.Column():
                g_html = gr.HTML()
                g_json = gr.Code(label="report.json", language="json")
        g_btn.click(
            fn=grade_ui,
            inputs=[g_text, g_file, g_eval],
            outputs=[g_html, g_json],
        )

    # Tab 2: offline template generation, graded automatically.
    with gr.Tab(label="Generate (offline)"):
        gr.Markdown(
            value=(
                "Generate eval prompts from a built-in seed, or paste your own seed "
                "spec (templates + slots). The result is graded automatically."
            )
        )
        with gr.Row():
            with gr.Column():
                s_choice = gr.Dropdown(
                    choices=list(BUILTIN_SEEDS),
                    value="eval",
                    label="Built-in seed",
                )
                s_n = gr.Slider(
                    minimum=10,
                    maximum=300,
                    value=120,
                    step=10,
                    label="How many records",
                )
                s_text = gr.Textbox(
                    label="…or paste a seed spec (JSON)",
                    lines=8,
                    placeholder=(
                        '{"kind":"eval","templates":["{a} about {b}"],'
                        '"slots":{"a":["Explain","Summarize"],"b":["TLS","DNS","OAuth"]},'
                        '"response":{"mode":"none"}}'
                    ),
                )
                s_btn = gr.Button(value="Generate + grade", variant="primary")
            with gr.Column():
                s_html = gr.HTML()
                s_jsonl = gr.Code(label="dataset.jsonl")
        s_btn.click(
            fn=generate_ui,
            inputs=[s_choice, s_text, s_n],
            outputs=[s_html, s_jsonl],
        )

    # Footer link back to the source repository.
    gr.Markdown(value=f"[⭐ Source on GitHub]({GH}) · MIT-licensed · zero-dependency core")


if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ git+https://github.com/LaelaZorana/synthkit.git