Spaces:

LaelaZ
/

synthkit

Sleeping

App Files Files Community

LaelaZ committed 5 days ago

Commit

1b1d946

verified ·

1 Parent(s): 23dc4b1

synthkit Space v0.3.0 — grade + offline generate demo

Browse files

Files changed (3) hide show

README.md +17 -7
app.py +125 -0
requirements.txt +1 -0

README.md CHANGED Viewed

@@ -1,13 +1,23 @@
 ---
-title: Synthkit
-emoji: 🦀
-colorFrom: purple
-colorTo: yellow
 sdk: gradio
-sdk_version: 6.16.0
-python_version: '3.13'
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: synthkit
+emoji: 🧪
+colorFrom: indigo
+colorTo: blue
 sdk: gradio
+sdk_version: 4.44.0
 app_file: app.py
 pinned: false
+license: mit
 ---
+# synthkit — synthetic data, graded
+Generate synthetic LLM data and **grade it** on validity, uniqueness, diversity,
+and contamination, with an A+→F headline. This Space is a live, offline demo
+(template generation + lexical grading).
+The full command-line tool adds LLM-backed instruction→output generation,
+fine-tuning output formats (alpaca / sharegpt / openai), and an embedding-based
+semantic-dedup axis that catches paraphrase duplicates lexical methods miss.
+👉 **Source & docs:** https://github.com/LaelaZorana/synthkit

app.py ADDED Viewed

	@@ -0,0 +1,125 @@

+"""synthkit — Hugging Face Space (Gradio).
+A small live demo of the synthkit quality grader and offline generator. This
+Space runs fully offline: template-based generation (no API key) and the lexical
+grading axes. The CLI adds LLM-backed generation, fine-tuning output formats, and
+an embedding-based semantic-dedup axis. Source: https://github.com/LaelaZorana/synthkit
+"""
+from __future__ import annotations
+import html
+import json
+import gradio as gr
+from synthkit import __version__
+from synthkit.grading import grade_dataset
+from synthkit.report import to_html, to_json
+from synthkit.text.generate import generate
+from synthkit.text.seeds import BUILTIN_SEEDS
+GH = "https://github.com/LaelaZorana/synthkit"
+def _records_from(text: str, file) -> list:
+    raw = ""
+    if file is not None:
+        path = file if isinstance(file, str) else getattr(file, "name", None)
+        with open(path, "r", encoding="utf-8") as fh:
+            raw = fh.read()
+    elif text and text.strip():
+        raw = text
+    raw = raw.strip()
+    if not raw:
+        raise ValueError("Paste some JSONL/JSON or upload a file first.")
+    if raw[0] == "[":
+        recs = json.loads(raw)
+    else:
+        recs = [json.loads(line) for line in raw.splitlines() if line.strip()]
+    if not isinstance(recs, list) or not recs:
+        raise ValueError("No records found.")
+    return recs
+def _iframe(report_html: str) -> str:
+    return (f'<iframe srcdoc="{html.escape(report_html, quote=True)}" '
+            'style="width:100%;height:760px;border:0;border-radius:14px;background:#0b1020">'
+            '</iframe>')
+def _err(exc: Exception) -> str:
+    return f'<p style="color:#dc2626;font:14px sans-serif">⚠ {html.escape(str(exc))}</p>'
def grade_ui(dataset_text, dataset_file, eval_text):
    """Grade the submitted dataset and return the two values the UI displays.

    Args:
        dataset_text: Pasted dataset records (JSON or JSONL).
        dataset_file: Optional uploaded dataset; passed through to
            ``_records_from`` together with ``dataset_text``.
        eval_text: Optional pasted held-out eval set, in the same formats as
            the dataset. Blank means no eval set is passed to the grader.

    Returns:
        A ``(html, json)`` pair: the iframe-wrapped HTML report and the output
        of ``to_json`` on success, or an error paragraph and ``""`` on failure.
    """
    try:
        records = _records_from(dataset_text, dataset_file)
        against = None
        if eval_text and eval_text.strip():
            # Use the same loader as the dataset. The previous per-line parse
            # turned a one-line JSON array into a single "record" that was
            # itself a list, silently skewing the comparison.
            try:
                against = _records_from(eval_text, None)
            except ValueError as exc:
                # Make it clear which of the two inputs failed to parse.
                raise ValueError(f"Eval set: {exc}") from exc
        report = grade_dataset(records, against=against)
        return _iframe(to_html(report, "uploaded dataset")), to_json(report, "uploaded dataset")
    except Exception as exc:  # noqa: BLE001 - surface any parse/grade error to the UI
        return _err(exc), ""
def generate_ui(seed_choice, seed_text, n):
    """Generate records from a seed spec, grade them, and return UI outputs.

    Args:
        seed_choice: Key into ``BUILTIN_SEEDS``; used only when ``seed_text``
            is blank.
        seed_text: Optional seed spec as JSON text; takes precedence when
            non-blank.
        n: Requested record count; coerced with ``int``.

    Returns:
        A ``(html, jsonl)`` pair: the iframe-wrapped HTML report and the
        generated records as JSONL on success, or an error paragraph and
        ``""`` on failure.
    """
    try:
        if seed_text and seed_text.strip():
            spec = json.loads(seed_text)
        else:
            spec = BUILTIN_SEEDS[seed_choice]
        # The generator seed is fixed at 17 (presumably for reproducible demo output).
        rows = generate(spec, int(n), seed=17)
        graded = grade_dataset(rows)
        encoded_rows = [json.dumps(row, ensure_ascii=False) for row in rows]
        report_frame = _iframe(to_html(graded, "generated dataset"))
        return report_frame, "\n".join(encoded_rows)
    except Exception as exc:  # noqa: BLE001
        return _err(exc), ""
# --- UI layout -------------------------------------------------------------
# Two tabs: one grades an uploaded/pasted dataset, the other generates a
# dataset from a seed spec and grades it.
with gr.Blocks(
    title="synthkit",
    theme=gr.themes.Soft(),
    css="footer{visibility:hidden}",
) as demo:
    # Header / intro blurb.
    gr.Markdown(
        value=(
            "# 🧪 synthkit\n"
            "**Generate synthetic data — and grade it before you train on it.** "
            "Scored on validity · uniqueness · diversity · contamination, with an A+→F headline.\n\n"
            "This Space runs offline (template generation + lexical grading). The full "
            f"[CLI]({GH}) adds LLM-backed instruction data, fine-tuning formats "
            f"(alpaca/sharegpt/openai), and an embedding semantic-dedup axis. · v{__version__}"
        )
    )

    # Tab 1: grade an existing dataset.
    with gr.Tab(label="Grade a dataset"):
        gr.Markdown(
            value=(
                "Upload or paste a `.jsonl` (one JSON record per line). Optionally "
                "paste a held-out eval set to check **contamination** (train/eval leakage)."
            )
        )
        with gr.Row():
            # Left column: inputs.
            with gr.Column():
                g_file = gr.File(
                    label="Dataset (.jsonl / .json)",
                    file_types=[".jsonl", ".json", ".txt"],
                )
                g_text = gr.Textbox(
                    label="…or paste records",
                    lines=8,
                    placeholder=(
                        '{"prompt": "Explain gradient descent"}\n'
                        '{"prompt": "What is a hash map?"}'
                    ),
                )
                g_eval = gr.Textbox(
                    label="Held-out eval set for contamination (optional)",
                    lines=3,
                    placeholder='{"prompt": "Explain gradient descent"}',
                )
                g_btn = gr.Button(value="Grade", variant="primary")
            # Right column: rendered report.
            with gr.Column():
                g_html = gr.HTML()
        g_json = gr.Code(label="report.json", language="json")
        g_btn.click(
            fn=grade_ui,
            inputs=[g_text, g_file, g_eval],
            outputs=[g_html, g_json],
        )

    # Tab 2: generate from a seed spec, then grade the result.
    with gr.Tab(label="Generate (offline)"):
        gr.Markdown(
            value=(
                "Generate eval prompts from a built-in seed, or paste your own seed "
                "spec (templates + slots). The result is graded automatically."
            )
        )
        with gr.Row():
            # Left column: seed selection and size.
            with gr.Column():
                s_choice = gr.Dropdown(
                    choices=list(BUILTIN_SEEDS),
                    value="eval",
                    label="Built-in seed",
                )
                s_n = gr.Slider(
                    minimum=10,
                    maximum=300,
                    value=120,
                    step=10,
                    label="How many records",
                )
                s_text = gr.Textbox(
                    label="…or paste a seed spec (JSON)",
                    lines=8,
                    placeholder=(
                        '{"kind":"eval","templates":["{a} about {b}"],'
                        '"slots":{"a":["Explain","Summarize"],"b":["TLS","DNS","OAuth"]},'
                        '"response":{"mode":"none"}}'
                    ),
                )
                s_btn = gr.Button(value="Generate + grade", variant="primary")
            # Right column: rendered report.
            with gr.Column():
                s_html = gr.HTML()
        s_jsonl = gr.Code(label="dataset.jsonl")
        s_btn.click(
            fn=generate_ui,
            inputs=[s_choice, s_text, s_n],
            outputs=[s_html, s_jsonl],
        )

    # Footer link.
    gr.Markdown(value=f"[⭐ Source on GitHub]({GH}) · MIT-licensed · zero-dependency core")

if __name__ == "__main__":
    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ git+https://github.com/LaelaZorana/synthkit.git