v0.4.0 — hardened app (input/N caps, queue) + pinned dependency
Browse files
- app.py +12 -4
- requirements.txt +1 -1
app.py
CHANGED
|
@@ -21,14 +21,20 @@ from synthkit.text.seeds import BUILTIN_SEEDS
|
|
| 21 |
GH = "https://github.com/LaelaZorana/synthkit"
|
| 22 |
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
def _records_from(text: str, file) -> list:
|
| 25 |
raw = ""
|
| 26 |
if file is not None:
|
| 27 |
path = file if isinstance(file, str) else getattr(file, "name", None)
|
| 28 |
with open(path, "r", encoding="utf-8") as fh:
|
| 29 |
-
raw = fh.read()
|
| 30 |
elif text and text.strip():
|
| 31 |
raw = text
|
|
|
|
|
|
|
| 32 |
raw = raw.strip()
|
| 33 |
if not raw:
|
| 34 |
raise ValueError("Paste some JSONL/JSON or upload a file first.")
|
|
@@ -38,6 +44,8 @@ def _records_from(text: str, file) -> list:
|
|
| 38 |
recs = [json.loads(line) for line in raw.splitlines() if line.strip()]
|
| 39 |
if not isinstance(recs, list) or not recs:
|
| 40 |
raise ValueError("No records found.")
|
|
|
|
|
|
|
| 41 |
return recs
|
| 42 |
|
| 43 |
|
|
@@ -56,7 +64,7 @@ def grade_ui(dataset_text, dataset_file, eval_text):
|
|
| 56 |
records = _records_from(dataset_text, dataset_file)
|
| 57 |
against = None
|
| 58 |
if eval_text and eval_text.strip():
|
| 59 |
-
against = [json.loads(ln) for ln in eval_text.splitlines() if ln.strip()]
|
| 60 |
report = grade_dataset(records, against=against)
|
| 61 |
return _iframe(to_html(report, "uploaded dataset")), to_json(report, "uploaded dataset")
|
| 62 |
except Exception as exc: # noqa: BLE001 - surface any parse/grade error to the UI
|
|
@@ -66,7 +74,7 @@ def grade_ui(dataset_text, dataset_file, eval_text):
|
|
| 66 |
def generate_ui(seed_choice, seed_text, n):
|
| 67 |
try:
|
| 68 |
spec = json.loads(seed_text) if seed_text and seed_text.strip() else BUILTIN_SEEDS[seed_choice]
|
| 69 |
-
data = generate(spec, int(n), seed=17)
|
| 70 |
report = grade_dataset(data)
|
| 71 |
jsonl = "\n".join(json.dumps(r, ensure_ascii=False) for r in data)
|
| 72 |
return _iframe(to_html(report, "generated dataset")), jsonl
|
|
@@ -122,4 +130,4 @@ with gr.Blocks(title="synthkit", theme=gr.themes.Soft(), css="footer{visibility:
|
|
| 122 |
gr.Markdown(f"[⭐ Source on GitHub]({GH}) · MIT-licensed · zero-dependency core")
|
| 123 |
|
| 124 |
if __name__ == "__main__":
|
| 125 |
-
demo.launch()
|
|
|
|
| 21 |
GH = "https://github.com/LaelaZorana/synthkit"
|
| 22 |
|
| 23 |
|
| 24 |
+
_MAX_INPUT_BYTES = 4_000_000 # ~4 MB ceiling on pasted/uploaded data (public demo)
|
| 25 |
+
_MAX_RECORDS = 5000 # cap rows graded per request
|
| 26 |
+
|
| 27 |
+
|
| 28 |
def _records_from(text: str, file) -> list:
|
| 29 |
raw = ""
|
| 30 |
if file is not None:
|
| 31 |
path = file if isinstance(file, str) else getattr(file, "name", None)
|
| 32 |
with open(path, "r", encoding="utf-8") as fh:
|
| 33 |
+
raw = fh.read(_MAX_INPUT_BYTES + 1)
|
| 34 |
elif text and text.strip():
|
| 35 |
raw = text
|
| 36 |
+
if len(raw) > _MAX_INPUT_BYTES:
|
| 37 |
+
raise ValueError(f"input too large (limit {_MAX_INPUT_BYTES // 1_000_000} MB)")
|
| 38 |
raw = raw.strip()
|
| 39 |
if not raw:
|
| 40 |
raise ValueError("Paste some JSONL/JSON or upload a file first.")
|
|
|
|
| 44 |
recs = [json.loads(line) for line in raw.splitlines() if line.strip()]
|
| 45 |
if not isinstance(recs, list) or not recs:
|
| 46 |
raise ValueError("No records found.")
|
| 47 |
+
if len(recs) > _MAX_RECORDS:
|
| 48 |
+
raise ValueError(f"too many records ({len(recs)}); this demo caps at {_MAX_RECORDS}")
|
| 49 |
return recs
|
| 50 |
|
| 51 |
|
|
|
|
| 64 |
records = _records_from(dataset_text, dataset_file)
|
| 65 |
against = None
|
| 66 |
if eval_text and eval_text.strip():
|
| 67 |
+
against = [json.loads(ln) for ln in eval_text.splitlines() if ln.strip()][:_MAX_RECORDS]
|
| 68 |
report = grade_dataset(records, against=against)
|
| 69 |
return _iframe(to_html(report, "uploaded dataset")), to_json(report, "uploaded dataset")
|
| 70 |
except Exception as exc: # noqa: BLE001 - surface any parse/grade error to the UI
|
|
|
|
| 74 |
def generate_ui(seed_choice, seed_text, n):
|
| 75 |
try:
|
| 76 |
spec = json.loads(seed_text) if seed_text and seed_text.strip() else BUILTIN_SEEDS[seed_choice]
|
| 77 |
+
data = generate(spec, max(1, min(int(n), 300)), seed=17) # hard-cap N on the public demo
|
| 78 |
report = grade_dataset(data)
|
| 79 |
jsonl = "\n".join(json.dumps(r, ensure_ascii=False) for r in data)
|
| 80 |
return _iframe(to_html(report, "generated dataset")), jsonl
|
|
|
|
| 130 |
gr.Markdown(f"[⭐ Source on GitHub]({GH}) · MIT-licensed · zero-dependency core")
|
| 131 |
|
| 132 |
if __name__ == "__main__":
|
| 133 |
+
demo.queue(default_concurrency_limit=2, max_size=24).launch()
|
requirements.txt
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
git+https://github.com/LaelaZorana/synthkit.git
|
|
|
|
| 1 |
+
synthkit @ git+https://github.com/LaelaZorana/synthkit.git@v0.4.0
|