LaelaZ committed on
Commit
3b33545
·
verified ·
1 Parent(s): 3c6c9bf

v0.4.0 — hardened app (input/N caps, queue) + pinned dependency

Browse files
Files changed (2) hide show
  1. app.py +12 -4
  2. requirements.txt +1 -1
app.py CHANGED
@@ -21,14 +21,20 @@ from synthkit.text.seeds import BUILTIN_SEEDS
21
  GH = "https://github.com/LaelaZorana/synthkit"
22
 
23
 
 
 
 
 
24
  def _records_from(text: str, file) -> list:
25
  raw = ""
26
  if file is not None:
27
  path = file if isinstance(file, str) else getattr(file, "name", None)
28
  with open(path, "r", encoding="utf-8") as fh:
29
- raw = fh.read()
30
  elif text and text.strip():
31
  raw = text
 
 
32
  raw = raw.strip()
33
  if not raw:
34
  raise ValueError("Paste some JSONL/JSON or upload a file first.")
@@ -38,6 +44,8 @@ def _records_from(text: str, file) -> list:
38
  recs = [json.loads(line) for line in raw.splitlines() if line.strip()]
39
  if not isinstance(recs, list) or not recs:
40
  raise ValueError("No records found.")
 
 
41
  return recs
42
 
43
 
@@ -56,7 +64,7 @@ def grade_ui(dataset_text, dataset_file, eval_text):
56
  records = _records_from(dataset_text, dataset_file)
57
  against = None
58
  if eval_text and eval_text.strip():
59
- against = [json.loads(l) for l in eval_text.splitlines() if l.strip()]
60
  report = grade_dataset(records, against=against)
61
  return _iframe(to_html(report, "uploaded dataset")), to_json(report, "uploaded dataset")
62
  except Exception as exc: # noqa: BLE001 - surface any parse/grade error to the UI
@@ -66,7 +74,7 @@ def grade_ui(dataset_text, dataset_file, eval_text):
66
  def generate_ui(seed_choice, seed_text, n):
67
  try:
68
  spec = json.loads(seed_text) if seed_text and seed_text.strip() else BUILTIN_SEEDS[seed_choice]
69
- data = generate(spec, int(n), seed=17)
70
  report = grade_dataset(data)
71
  jsonl = "\n".join(json.dumps(r, ensure_ascii=False) for r in data)
72
  return _iframe(to_html(report, "generated dataset")), jsonl
@@ -122,4 +130,4 @@ with gr.Blocks(title="synthkit", theme=gr.themes.Soft(), css="footer{visibility:
122
  gr.Markdown(f"[⭐ Source on GitHub]({GH}) · MIT-licensed · zero-dependency core")
123
 
124
  if __name__ == "__main__":
125
- demo.launch()
 
21
  GH = "https://github.com/LaelaZorana/synthkit"
22
 
23
 
24
+ _MAX_INPUT_BYTES = 4_000_000 # ~4 MB ceiling on pasted/uploaded data (public demo)
25
+ _MAX_RECORDS = 5000 # cap rows graded per request
26
+
27
+
28
  def _records_from(text: str, file) -> list:
29
  raw = ""
30
  if file is not None:
31
  path = file if isinstance(file, str) else getattr(file, "name", None)
32
  with open(path, "r", encoding="utf-8") as fh:
33
+ raw = fh.read(_MAX_INPUT_BYTES + 1)
34
  elif text and text.strip():
35
  raw = text
36
+ if len(raw) > _MAX_INPUT_BYTES:
37
+ raise ValueError(f"input too large (limit {_MAX_INPUT_BYTES // 1_000_000} MB)")
38
  raw = raw.strip()
39
  if not raw:
40
  raise ValueError("Paste some JSONL/JSON or upload a file first.")
 
44
  recs = [json.loads(line) for line in raw.splitlines() if line.strip()]
45
  if not isinstance(recs, list) or not recs:
46
  raise ValueError("No records found.")
47
+ if len(recs) > _MAX_RECORDS:
48
+ raise ValueError(f"too many records ({len(recs)}); this demo caps at {_MAX_RECORDS}")
49
  return recs
50
 
51
 
 
64
  records = _records_from(dataset_text, dataset_file)
65
  against = None
66
  if eval_text and eval_text.strip():
67
+ against = [json.loads(ln) for ln in eval_text.splitlines() if ln.strip()][:_MAX_RECORDS]
68
  report = grade_dataset(records, against=against)
69
  return _iframe(to_html(report, "uploaded dataset")), to_json(report, "uploaded dataset")
70
  except Exception as exc: # noqa: BLE001 - surface any parse/grade error to the UI
 
74
  def generate_ui(seed_choice, seed_text, n):
75
  try:
76
  spec = json.loads(seed_text) if seed_text and seed_text.strip() else BUILTIN_SEEDS[seed_choice]
77
+ data = generate(spec, max(1, min(int(n), 300)), seed=17) # hard-cap N on the public demo
78
  report = grade_dataset(data)
79
  jsonl = "\n".join(json.dumps(r, ensure_ascii=False) for r in data)
80
  return _iframe(to_html(report, "generated dataset")), jsonl
 
130
  gr.Markdown(f"[⭐ Source on GitHub]({GH}) · MIT-licensed · zero-dependency core")
131
 
132
  if __name__ == "__main__":
133
+ demo.queue(default_concurrency_limit=2, max_size=24).launch()
requirements.txt CHANGED
@@ -1 +1 @@
1
- git+https://github.com/LaelaZorana/synthkit.git
 
1
+ synthkit @ git+https://github.com/LaelaZorana/synthkit.git@v0.4.0