Mfischthal committed
Commit d70667c · verified · 1 Parent(s): 758613a

Upload 2 files

Files changed (2):
  1. app.py +9 -13
  2. data_io.py +45 -16
app.py CHANGED
@@ -1,14 +1,13 @@
+
 import os
 import gradio as gr
 from typing import List, Dict, Any, Tuple
 
-# Local imports
 from data_io import load_from_hub_or_upload
 from teacher import call_teacher, MODEL, INSTRUCTION
 from validators import validate_output
 from exporters import to_jsonl, to_hf_dataset
 
-# ---------------- State ----------------
 SESSION: Dict[str, Any] = {
     "passages": [],
     "records": [],
@@ -21,12 +20,12 @@ DESCRIPTION = (
     "review & edit, and export JSONL / HF Datasets."
 )
 
-# ---------------- Callbacks ----------------
-def on_prepare(src_mode: str, hf_id: str, upload, sample: float, min_words: float, chunk: float) -> str:
+def on_prepare(src_mode: str, hf_id: str, upload, sample: float, min_words: float, chunk: float, quote_pairs: float) -> str:
     sample_i = int(sample) if sample else 0
     min_words_i = int(min_words) if min_words else 80
     chunk_i = int(chunk) if chunk else 1200
-    passages, dataset_id = load_from_hub_or_upload(src_mode, hf_id, upload, sample_i, min_words_i, chunk_i)
+    qpairs_i = int(quote_pairs) if quote_pairs else 0
+    passages, dataset_id = load_from_hub_or_upload(src_mode, hf_id, upload, sample_i, min_words_i, chunk_i, quote_pairs=qpairs_i)
     SESSION["passages"] = passages
     SESSION["dataset_id"] = dataset_id
     SESSION["records"] = []
@@ -36,9 +35,7 @@ def on_generate(model_name: str, temperature: float) -> Tuple[str, list]:
     if not SESSION["passages"]:
         return "No passages prepared yet.", []
     os.environ["OPENAI_MODEL"] = model_name
-    rows = []
-    records = []
-    ok = bad = 0
+    rows, records, ok, bad = [], [], 0, 0
     for i, p in enumerate(SESSION["passages"]):
         y = call_teacher(p, temperature=float(temperature))
         status = "unreviewed"
@@ -94,7 +91,6 @@ def on_push(push_repo: str, private_toggle: bool) -> str:
     )
     return f"Pushed {len(ds)} records to {push_repo}"
 
-# ---------------- UI ----------------
 def build_ui():
     with gr.Blocks(title="Dialogue→Speaker Dataset Builder", theme=gr.themes.Default()) as demo:
         gr.Markdown("# Dialogue→Speaker Dataset Builder")
@@ -104,9 +100,10 @@ def build_ui():
             src_mode = gr.Radio(["HF Dataset", "Upload .txt"], value="HF Dataset", label="Source")
             hf_id = gr.Textbox(value="Navanjana/Gutenberg_books", label="HF dataset id (train split)")
             upload = gr.File(file_types=[".txt"], label="Upload a .txt file")
-            sample = gr.Number(value=200, label="Sample passages (0 = all)")
+            sample = gr.Number(value=5, label="Sample passages (0 = all)")
             min_words = gr.Number(value=80, label="Min words per passage")
             chunk = gr.Number(value=1200, label="Chunk size (chars)")
+            quote_pairs = gr.Number(value=1, label="Min dialogue quote-pairs (0 = no filter)")
             btn_prep = gr.Button("Prepare passages")
             info_data = gr.Markdown()
 
@@ -138,8 +135,7 @@ def build_ui():
         instr = gr.Textbox(value=INSTRUCTION, lines=14, label="Canonical instruction (read-only)", interactive=False)
         gr.Markdown("Set `OPENAI_API_KEY` & optional `OPENAI_MODEL` in Space Secrets.")
 
-        # Wire callbacks
-        btn_prep.click(on_prepare, [src_mode, hf_id, upload, sample, min_words, chunk], [info_data])
+        btn_prep.click(on_prepare, [src_mode, hf_id, upload, sample, min_words, chunk, quote_pairs], [info_data])
         btn_gen.click(on_generate, [model_box, temperature], [progress_gen, rec_table])
         btn_load.click(on_load, [idx], [inp, out, status])
         btn_save.click(on_save, [idx, out, status], [review_msg])
@@ -151,4 +147,4 @@ def build_ui():
 demo = build_ui()
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
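
Note on the app.py wiring: the inputs list passed to Gradio's `.click` maps positionally onto the callback's parameters, which is why `quote_pairs` was appended to both the component list and the `on_prepare` signature. A minimal, self-contained sketch of that contract (stub callback standing in for the app's `on_prepare`; the full layout is omitted):

```python
import gradio as gr

# Stub with the same parameter order as the diff's on_prepare.
def on_prepare(src_mode, hf_id, upload, sample, min_words, chunk, quote_pairs):
    return f"mode={src_mode}, sample={int(sample)}, quote_pairs={int(quote_pairs)}"

with gr.Blocks() as demo:
    src_mode = gr.Radio(["HF Dataset", "Upload .txt"], value="HF Dataset")
    hf_id = gr.Textbox(value="Navanjana/Gutenberg_books")
    upload = gr.File(file_types=[".txt"])
    sample = gr.Number(value=5)
    min_words = gr.Number(value=80)
    chunk = gr.Number(value=1200)
    quote_pairs = gr.Number(value=1)
    btn_prep = gr.Button("Prepare passages")
    info_data = gr.Markdown()
    # The inputs list lines up positionally with on_prepare's parameters.
    btn_prep.click(on_prepare,
                   [src_mode, hf_id, upload, sample, min_words, chunk, quote_pairs],
                   [info_data])
```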
data_io.py CHANGED
@@ -1,10 +1,13 @@
+
 from datasets import load_dataset
 from ftfy import fix_text
 import regex as re
-from typing import List, Tuple
+from typing import List, Tuple, Iterable, Optional
 
 DEF_CHUNK = 1200
 
+CANDIDATE_TEXT_FIELDS = ["text", "content", "body", "article", "raw"]
+
 def ascii_quotes(s: str) -> str:
     return (s.replace("“","\"").replace("”","\"")
             .replace("‘","'").replace("’","'")
@@ -22,23 +25,47 @@ def split_passages(text: str, max_chars: int = DEF_CHUNK) -> List[str]:
     if buf: out.append(buf)
     return out
 
-def load_from_hub_or_upload(src_mode: str, dataset_id: str, upload_file, sample: int, min_words: int, chunk: int) -> Tuple[List[str], str]:
+def pick_text(example: dict) -> Optional[str]:
+    for key in CANDIDATE_TEXT_FIELDS:
+        val = example.get(key, None)
+        if isinstance(val, str) and val.strip():
+            return val
+    # fallback: find the longest string value
+    strings = [str(v) for v in example.values() if isinstance(v, str)]
+    if strings:
+        return max(strings, key=len)
+    return None
+
+def has_enough_quotes(passage: str, min_pairs: int = 1) -> bool:
+    # Count double quotes after normalization
+    q = passage.count('"')
+    return (q // 2) >= min_pairs
+
+def iter_passages_streaming(dataset_id: str, split: str = "train", min_words: int = 80, chunk: int = DEF_CHUNK, quote_pairs: int = 0):
+    """Stream records without downloading full dataset; yields normalized, chunked passages."""
+    ds = load_dataset(dataset_id, split=split, streaming=True)
+    for ex in ds:
+        raw = pick_text(ex) or ""
+        if not raw.strip():
+            continue
+        tx = ascii_quotes(fix_text(raw)).strip()
+        for p in split_passages(tx, max_chars=int(chunk)):
+            if len(p.split()) < int(min_words):
+                continue
+            if quote_pairs and not has_enough_quotes(p, min_pairs=quote_pairs):
+                continue
+            yield p
+
+def load_from_hub_or_upload(src_mode: str, dataset_id: str, upload_file, sample: int, min_words: int, chunk: int, quote_pairs: int = 0) -> Tuple[List[str], str]:
+    """Return up to `sample` passages; uses streaming for HF datasets to avoid full downloads."""
     passages: List[str] = []
     actual_id = None
+    cap = int(sample) if sample else 0
+
     if src_mode == "HF Dataset":
-        ds = load_dataset(dataset_id, split="train")
-        for ex in ds:
-            raw = ex.get("text", "") or ""
-            if not raw.strip():
-                continue
-            tx = ascii_quotes(fix_text(raw)).strip()
-            for p in split_passages(tx, max_chars=int(chunk)):
-                if len(p.split()) < int(min_words):
-                    continue
-                passages.append(p)
-                if sample and len(passages) >= int(sample):
-                    break
-            if sample and len(passages) >= int(sample):
+        for p in iter_passages_streaming(dataset_id, split="train", min_words=min_words, chunk=chunk, quote_pairs=quote_pairs):
+            passages.append(p)
+            if cap and len(passages) >= cap:
                 break
         actual_id = dataset_id
     else:
@@ -49,8 +76,10 @@ def load_from_hub_or_upload(src_mode: str, dataset_id: str, upload_file, sample:
         for p in split_passages(tx, max_chars=int(chunk)):
             if len(p.split()) < int(min_words):
                 continue
+            if quote_pairs and not has_enough_quotes(p, min_pairs=quote_pairs):
+                continue
             passages.append(p)
-            if sample and len(passages) >= int(sample):
+            if cap and len(passages) >= cap:
                 break
         actual_id = getattr(upload_file, 'name', 'upload.txt')
 
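
A quick local sanity check of the new streaming path and the quote-pair filter (a sketch assuming `datasets`, `ftfy`, and `regex` are installed; the dataset id is the app's default, and stopping after three passages is an arbitrary cutoff):

```python
from data_io import iter_passages_streaming, has_enough_quotes

# The heuristic counts normalized double quotes and divides by two.
assert has_enough_quotes('He said, "Hello." She said, "Hi."', min_pairs=2)
assert not has_enough_quotes("No dialogue here.", min_pairs=1)

# Streaming iterates shards over HTTP, so a sample-capped run touches
# only as much of the corpus as it needs.
gen = iter_passages_streaming(
    "Navanjana/Gutenberg_books",
    split="train",
    min_words=80,
    chunk=1200,
    quote_pairs=1,
)
for _, passage in zip(range(3), gen):
    print(passage[:100], "...")
```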