Mfischthal committed on
Commit
f432fa9
·
verified ·
1 Parent(s): e5560f2

Upload 7 files

Browse files
Files changed (7) hide show
  1. README.md +11 -1
  2. app.py +135 -1
  3. data_io.py +57 -1
  4. exporters.py +19 -1
  5. requirements.txt +8 -1
  6. teacher.py +41 -1
  7. validators.py +19 -1
README.md CHANGED
@@ -1 +1,11 @@
1
- Placeholder for README.md. Replace with actual code from spec.
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dialogue→Speaker Dataset Builder (HF Spaces)
2
+ A GUI app (Gradio) that prepares text passages, calls the OpenAI API to structure dialogue into `Speaker N:` lines, lets you review & edit, and exports JSONL or a HF Dataset.
3
+
4
+ ## Quickstart (HF Spaces)
5
+ 1. Create a new Space → SDK: **Gradio**.
6
+ 2. Add **Secrets**:
7
+ - `OPENAI_API_KEY` (required)
8
+ - `OPENAI_MODEL` (optional, default `gpt-4o-mini`)
9
+ - `HF_TOKEN` (optional, for push_to_hub)
10
+ 3. Upload all these files.
11
+ 4. Launch the Space.
app.py CHANGED
@@ -1 +1,135 @@
1
- Placeholder for app.py. Replace with actual code from spec.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ from typing import List, Dict, Any
4
+ from data_io import load_from_hub_or_upload
5
+ from teacher import call_teacher, MODEL, INSTRUCTION
6
+ from validators import validate_output
7
+ from exporters import to_jsonl, to_hf_dataset
8
+
9
# Working state read and written by the event handlers below.
# NOTE(review): this is a module-level dict, so it is shared by everything that
# runs in this process; confirm that is acceptable if several people use the
# Space at once (gr.State would give per-session isolation).
SESSION: Dict[str, Any] = {
    "passages": [],
    "records": [],
    "dataset_id": None,
}

DESCRIPTION = """### Dialogue→Speaker Dataset Builder
A Gradio app that prepares passages, generates `Speaker N:`-structured dialogue via the OpenAI API, lets you review & edit, and exports JSONL / HF Datasets."""

with gr.Blocks(title="Dialogue→Speaker Dataset Builder") as demo:
    gr.Markdown("# Dialogue→Speaker Dataset Builder")
    gr.Markdown(DESCRIPTION)

    with gr.Tab("Data"):
        src_mode = gr.Radio(["HF Dataset", "Upload .txt"], value="HF Dataset", label="Source")
        hf_id = gr.Textbox(value="Navanjana/Gutenberg_books", label="HF dataset id (train split)")
        upload = gr.File(file_types=[".txt"], label="Upload a .txt file")
        sample = gr.Number(value=200, label="Sample passages (0 = all)")
        min_words = gr.Number(value=80, label="Min words per passage")
        chunk = gr.Number(value=1200, label="Chunk size (chars)")
        btn_prep = gr.Button("Prepare passages")
        info_data = gr.Markdown()

    with gr.Tab("Generation"):
        model_box = gr.Textbox(value=os.getenv("OPENAI_MODEL", MODEL), label="OpenAI model")
        temperature = gr.Slider(0, 1, value=0.0, step=0.1, label="Temperature")
        btn_gen = gr.Button("Generate with OpenAI")
        progress_gen = gr.Markdown()
        rec_table = gr.Dataframe(headers=["#", "status", "chars"], row_count=(0, "dynamic"))

    with gr.Tab("Review"):
        idx = gr.Number(value=0, label="Record #")
        inp = gr.Textbox(lines=12, label="Input passage", interactive=False)
        out = gr.Textbox(lines=12, label="Output (edit)")
        status = gr.Dropdown(["accepted","needs_work","unreviewed"], value="unreviewed", label="Status")
        btn_load = gr.Button("Load record")
        btn_save = gr.Button("Save changes")
        review_msg = gr.Markdown()

    with gr.Tab("Export"):
        btn_jsonl = gr.Button("Download JSONL")
        dl_path = gr.Textbox(label="JSONL path")
        push_repo = gr.Textbox(value="", label="HF Dataset repo (e.g. yourname/gutenberg_dialogue_v1)")
        private_toggle = gr.Checkbox(value=True, label="Private repo")
        btn_push = gr.Button("Push to Hugging Face Hub")
        export_msg = gr.Markdown()

    with gr.Tab("Settings"):
        instr = gr.Textbox(value=INSTRUCTION, lines=14, label="Canonical instruction (read-only)", interactive=False)
        gr.Markdown("Set `OPENAI_API_KEY` & optional `OPENAI_MODEL` in Space Secrets.")

    def _require_int(value, label):
        """Convert a numeric widget value to int, failing with a readable UI error.

        A cleared number field reaches the handler as None, which would
        otherwise surface as an opaque TypeError from int().
        """
        if value is None:
            raise gr.Error(f"'{label}' must be a number.")
        return int(value)

    def _record_at(idx):
        """Return (index, record) for a valid record number or raise gr.Error."""
        records = SESSION["records"]
        i = _require_int(idx, "Record #")
        if not 0 <= i < len(records):
            if not records:
                raise gr.Error("No records yet. Generate records before reviewing.")
            raise gr.Error(f"Record #{i} does not exist (valid range: 0-{len(records) - 1}).")
        return i, records[i]

    def on_prepare(src_mode, hf_id, upload, sample, min_words, chunk):
        """Load and chunk passages, then reset any previously generated records."""
        passages, dataset_id = load_from_hub_or_upload(
            src_mode,
            hf_id,
            upload,
            _require_int(sample, "Sample passages"),
            _require_int(min_words, "Min words per passage"),
            _require_int(chunk, "Chunk size"),
        )
        SESSION["passages"] = passages
        SESSION["dataset_id"] = dataset_id
        SESSION["records"] = []
        return f"Prepared {len(passages)} passages from: {dataset_id}"

    def on_generate(model_name, temperature):
        """Run the teacher model over every prepared passage and store the records.

        Returns the summary message and the rows for the overview table.
        """
        if not SESSION["passages"]:
            return "No passages prepared yet.", []
        # A blank model box would otherwise put an empty model name in the
        # environment; fall back to the configured default instead.
        model_name = (model_name or "").strip() or MODEL
        # call_teacher takes no model argument, so the choice is handed over
        # through the OPENAI_MODEL environment variable.
        os.environ["OPENAI_MODEL"] = model_name
        rows, records, ok, bad = [], [], 0, 0
        for i, p in enumerate(SESSION["passages"]):
            y = call_teacher(p, temperature=float(temperature))
            status = "unreviewed"
            if y and validate_output(y):
                ok += 1
            else:
                bad += 1
                y = y or ""
                status = "needs_work"
            records.append({
                "task": "dialogue_format",
                "instruction": INSTRUCTION,
                "input": p,
                "output": y,
                "meta": {
                    "chars": len(p),
                    "model": model_name,
                    "status": status,
                    "source": "LLM",
                    "dataset_id": SESSION["dataset_id"],
                },
            })
            rows.append([i, status, len(p)])
        SESSION["records"] = records
        return f"Generated {ok} valid, {bad} need work.", rows

    def on_load(idx):
        """Fetch one record for the review tab: (input, output, status)."""
        _, r = _record_at(idx)
        return r["input"], r["output"], r["meta"]["status"]

    def on_save(idx, output, status):
        """Write the edited output and review status back into the record."""
        i, r = _record_at(idx)
        r["output"] = output
        r["meta"]["status"] = status
        return f"Saved record #{i} as {status}."

    def on_export_jsonl():
        """Write all current records to a JSONL file and return its path."""
        path = "workspace/dataset.jsonl"
        to_jsonl(SESSION["records"], path)
        return path

    def on_push(push_repo, private_toggle):
        """Save the records as a HF dataset locally and push them to the Hub."""
        if not push_repo:
            return "Provide a repo name like 'yourname/gutenberg_dialogue_v1'"
        if not SESSION["records"]:
            # Avoid creating or overwriting a Hub repo with an empty dataset.
            return "No records to push yet. Generate records first."
        ds = to_hf_dataset(
            SESSION["records"],
            save_to="workspace/hf_dataset",
            push_repo=push_repo,
            private=bool(private_toggle),
            token=os.getenv("HF_TOKEN"),
        )
        return f"Pushed {len(ds)} records to {push_repo}"

    btn_prep.click(on_prepare, [src_mode, hf_id, upload, sample, min_words, chunk], [info_data])
    btn_gen.click(on_generate, [model_box, temperature], [progress_gen, rec_table])
    btn_load.click(on_load, [idx], [inp, out, status])
    btn_save.click(on_save, [idx, out, status], [review_msg])
    btn_jsonl.click(on_export_jsonl, [], [dl_path])
    btn_push.click(on_push, [push_repo, private_toggle], [export_msg])

if __name__ == "__main__":
    demo.launch()
data_io.py CHANGED
@@ -1 +1,57 @@
1
- Placeholder for data_io.py. Replace with actual code from spec.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ from ftfy import fix_text
3
+ import regex as re
4
+ from typing import List, Tuple
5
+
6
DEF_CHUNK = 1200

# Typographic quote characters and the ASCII quote each one is replaced with.
_QUOTE_TABLE = str.maketrans({
    "“": "\"", "”": "\"",
    "‘": "'", "’": "'",
    "«": "\"", "»": "\"",
})


def ascii_quotes(s: str) -> str:
    """Return *s* with curly quotes and guillemets replaced by ASCII quotes."""
    # One translate pass instead of six chained replace() calls; same output.
    return s.translate(_QUOTE_TABLE)


def _split_oversized(paragraph: str, max_chars: int) -> List[str]:
    """Cut a paragraph longer than *max_chars* into pieces that each fit.

    Cuts are made at the last space or newline inside the window so words stay
    intact; text with no whitespace in the window is cut at exactly
    *max_chars*. Requires ``max_chars >= 1`` and a stripped, non-empty input.
    """
    pieces: List[str] = []
    rest = paragraph
    while len(rest) > max_chars:
        # Look one character past the limit so whitespace sitting exactly at
        # the limit still lets the piece use all max_chars characters.
        window = rest[:max_chars + 1]
        cut = max(window.rfind(" "), window.rfind("\n"))
        if cut <= 0:
            cut = max_chars
        pieces.append(rest[:cut].rstrip())
        rest = rest[cut:].lstrip()
    if rest:
        pieces.append(rest)
    return pieces


def split_passages(text: str, max_chars: int = DEF_CHUNK) -> List[str]:
    """Split *text* into passages of at most *max_chars* characters.

    Paragraphs (separated by two or more newlines) are stripped and packed
    greedily: consecutive paragraphs are joined with a blank line while the
    result still fits. A single paragraph that is longer than *max_chars* is
    broken at whitespace into several passages, so no returned passage exceeds
    the limit. With a non-positive *max_chars* nothing is merged or broken and
    every paragraph becomes its own passage.

    Returns an empty list for empty or whitespace-only input.
    """
    paragraphs = [p.strip() for p in re.split(r"\n{2,}", text) if p.strip()]
    chunks: List[str] = []
    buf = ""
    for para in paragraphs:
        if max_chars > 0 and len(para) > max_chars:
            if buf:
                chunks.append(buf)
            # Emit the full-size pieces right away; the remainder becomes the
            # new buffer so it can only be merged with the *next* paragraph
            # (never re-joined with its own siblings by a blank line).
            *full, buf = _split_oversized(para, max_chars)
            chunks.extend(full)
            continue
        if buf and len(buf) + len(para) + 2 <= max_chars:
            buf = f"{buf}\n\n{para}"
        else:
            if buf:
                chunks.append(buf)
            buf = para
    if buf:
        chunks.append(buf)
    return chunks
24
+
25
def _read_upload(upload_file) -> str:
    """Return the text of an upload in whichever form the UI hands it over.

    Accepts raw bytes, a filesystem path (str or os.PathLike), a file-like
    object with ``read()``, or an object exposing the path as ``.name``.
    Bytes are decoded as UTF-8 with undecodable bytes dropped.
    """
    import os

    if isinstance(upload_file, (bytes, bytearray)):
        data = bytes(upload_file)
    elif isinstance(upload_file, (str, os.PathLike)):
        # Path check comes before the read() check: a path string has no read().
        with open(upload_file, "rb") as fh:
            data = fh.read()
    elif hasattr(upload_file, "read"):
        data = upload_file.read()
    else:
        with open(upload_file.name, "rb") as fh:
            data = fh.read()
    if isinstance(data, str):
        return data
    return data.decode("utf-8", errors="ignore")


def _collect_passages(passages: List[str], raw: str, sample: int, min_words: int, chunk: int) -> bool:
    """Clean *raw*, chunk it, and append the passages that are long enough.

    Returns True as soon as *passages* holds *sample* items (only when
    *sample* is positive), telling the caller to stop reading more text.
    """
    text = ascii_quotes(fix_text(raw)).strip()
    for p in split_passages(text, max_chars=chunk):
        if len(p.split()) < min_words:
            continue
        passages.append(p)
        if sample > 0 and len(passages) >= sample:
            return True
    return False


def load_from_hub_or_upload(src_mode: str, dataset_id: str, upload_file, sample: int, min_words: int, chunk: int) -> Tuple[List[str], str]:
    """Build the list of passages from a Hub dataset or an uploaded text file.

    Args:
        src_mode: ``"HF Dataset"`` reads the ``train`` split of *dataset_id*;
            any other value reads *upload_file*.
        dataset_id: Hub dataset id; each example's ``"text"`` field is used.
        upload_file: The uploaded file (path, bytes or file-like), or None.
        sample: Maximum number of passages to return; 0 or less means no limit.
        min_words: Passages with fewer whitespace-separated words are dropped.
        chunk: Maximum passage size in characters.

    Returns:
        ``(passages, source_id)`` where *source_id* is the dataset id, the
        upload's name/path, or ``"(no upload)"`` when no file was provided.
    """
    sample, min_words, chunk = int(sample), int(min_words), int(chunk)
    passages: List[str] = []

    if src_mode == "HF Dataset":
        ds = load_dataset(dataset_id, split="train")
        for ex in ds:
            raw = ex.get("text", "") or ""
            if not raw.strip():
                continue
            if _collect_passages(passages, raw, sample, min_words, chunk):
                break
        return passages, dataset_id

    if upload_file is None:
        return [], "(no upload)"

    _collect_passages(passages, _read_upload(upload_file), sample, min_words, chunk)
    # Prefer an explicit .name; a plain path string identifies itself.
    fallback = upload_file if isinstance(upload_file, str) else "upload.txt"
    actual_id = getattr(upload_file, "name", fallback)
    return passages, actual_id
exporters.py CHANGED
@@ -1 +1,19 @@
1
- Placeholder for exporters.py. Replace with actual code from spec.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, json
2
+ from typing import List, Dict, Any, Optional
3
+ from datasets import Dataset
4
+
5
def to_jsonl(records: List[Dict[str, Any]], path: str) -> None:
    """Write *records* to *path* as UTF-8 JSON Lines, one object per line.

    Missing parent directories are created. Non-ASCII characters are written
    as-is rather than as ``\\uXXXX`` escapes. An existing file is overwritten.
    """
    parent = os.path.dirname(path)
    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create directories when there is one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(r, ensure_ascii=False) + "\n" for r in records)
10
+
11
def to_hf_dataset(records: List[Dict[str, Any]], save_to: Optional[str] = None,
                  push_repo: Optional[str] = None, private: bool = True, token: Optional[str] = None):
    """Turn *records* into a ``datasets.Dataset`` and optionally persist it.

    Args:
        records: One dict per example.
        save_to: When truthy, a directory (created if missing) the dataset is
            saved to with ``save_to_disk``.
        push_repo: When truthy, the Hub repo id the dataset is pushed to.
        private: Passed to ``push_to_hub`` as the repo visibility flag.
        token: Passed to ``push_to_hub`` for authentication.

    Returns:
        The dataset built from *records*.
    """
    dataset = Dataset.from_list(records)

    wants_local_copy = bool(save_to)
    if wants_local_copy:
        os.makedirs(save_to, exist_ok=True)
        dataset.save_to_disk(save_to)

    wants_push = bool(push_repo)
    if wants_push:
        dataset.push_to_hub(push_repo, private=private, token=token)

    return dataset
requirements.txt CHANGED
@@ -1 +1,8 @@
1
- Placeholder for requirements.txt. Replace with actual code from spec.
 
 
 
 
 
 
 
 
1
+ gradio>=4.44.0
2
+ datasets>=3.0.0
3
+ ftfy
4
+ regex
5
+ openai>=1.40.0
6
+ pydantic
7
+ pandas
8
+ orjson
teacher.py CHANGED
@@ -1 +1,41 @@
1
- Placeholder for teacher.py. Replace with actual code from spec.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, time
2
+ from typing import Optional
3
+ from openai import OpenAI
4
+
5
# Canonical prompt sent (followed by the passage) to the teacher model and
# stored alongside every generated record.
INSTRUCTION = """You are a dialogue structuring assistant for multi-speaker TTS.

Map characters to speakers dynamically within each passage (first distinct speaker you detect -> Speaker 1, second -> Speaker 2, etc.).

Requirements:
- Detect speaker changes from context (“said/replied/asked/…”).
- Output lines strictly as:
Speaker 1: …
Speaker 2: …
(and so on)
- Label narration (non-dialogue) as Speaker 1.
- Remove dialogue attribution tags (e.g., “he said”), EXCEPT when the narrator speaks in first person; keep those inline (e.g., “I said”).
- Preserve original order and content; no omissions or rewrites.
- Return only the formatted lines, no extra commentary.
"""

MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
# The API client is created on first use (see _get_client) instead of at
# import time: constructing OpenAI() without an API key raises, which would
# stop the whole app from starting before it can show any message.
client: "Optional[OpenAI]" = None
STRICT_SUFFIX = "\n\nIMPORTANT: Every line must start with 'Speaker N: ' and include at least two lines."


def _get_client() -> "OpenAI":
    """Return the shared OpenAI client, creating it on the first call."""
    global client
    if client is None:
        client = OpenAI()
    return client


def call_teacher(passage: str, temperature: float = 0.0, max_retries: int = 2) -> Optional[str]:
    """Ask the teacher model to restructure *passage* into ``Speaker N:`` lines.

    The first attempt sends INSTRUCTION plus the passage. Every retry (after an
    API error or an empty answer) sends the same prompt with STRICT_SUFFIX
    appended exactly once.

    Args:
        passage: Source text to structure.
        temperature: Sampling temperature forwarded to the API.
        max_retries: Number of additional attempts after the first one.

    Returns:
        The model's text output, or None when every attempt failed or came
        back empty.

    Raises:
        Whatever ``OpenAI()`` raises when the client cannot be constructed
        (for example a missing API key); that is a configuration problem and
        is not retried.
    """
    import logging

    log = logging.getLogger(__name__)
    api = _get_client()
    model = os.getenv("OPENAI_MODEL", MODEL)
    base_prompt = f"{INSTRUCTION}\n\nText:\n{passage}"
    strict_prompt = base_prompt + STRICT_SUFFIX
    attempts = max_retries + 1
    for attempt in range(attempts):
        prompt = base_prompt if attempt == 0 else strict_prompt
        try:
            resp = api.responses.create(
                model=model,
                input=prompt,
                temperature=temperature,
            )
            out = resp.output_text
        except Exception:
            # Best-effort: keep going, but leave a trace instead of failing
            # silently so persistent errors can be diagnosed from the logs.
            log.warning("Teacher call failed (attempt %d of %d)", attempt + 1, attempts, exc_info=True)
            if attempt < attempts - 1:
                # Linear back-off; pointless after the final attempt.
                time.sleep(0.5 * (attempt + 1))
            continue
        if out and out.strip():
            return out
    return None
validators.py CHANGED
@@ -1 +1,19 @@
1
- Placeholder for validators.py. Replace with actual code from spec.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import regex as re
2
# A valid line starts with "Speaker <digits>:" followed by whitespace.
# Group 1 captures the "Speaker <digits>" label.
SPEAKER_LINE = re.compile(r"^(Speaker\s+\d+):\s")


def validate_output(text: str, min_lines: int = 2, max_speaker_index: int = 9) -> bool:
    """Check that *text* consists solely of well-formed ``Speaker N: …`` lines.

    Blank lines are ignored. The text is valid when it has at least
    *min_lines* non-blank lines, every one of them matches SPEAKER_LINE, and
    every speaker number lies between 1 and *max_speaker_index* inclusive
    (speakers are numbered from 1, so ``Speaker 0`` is rejected).

    Returns False for empty input.
    """
    if not text:
        return False
    lines = [ln for ln in text.splitlines() if ln.strip()]
    if len(lines) < min_lines:
        return False
    for ln in lines:
        m = SPEAKER_LINE.match(ln)
        if m is None:
            return False
        # The label is "Speaker <digits>"; its last token is the number.
        num = int(m.group(1).split()[-1])
        if not 1 <= num <= max_speaker_index:
            return False
    return True
19
+ return True