Spaces:

Mfischthal
/

DialogueExtractor

Sleeping

App Files Files Community

Mfischthal committed on Oct 17, 2025

Commit

f432fa9

verified ·

1 Parent(s): e5560f2

Upload 7 files

Browse files

Files changed (7) hide show

README.md +11 -1
app.py +135 -1
data_io.py +57 -1
exporters.py +19 -1
requirements.txt +8 -1
teacher.py +41 -1
validators.py +19 -1

README.md CHANGED Viewed

	@@ -1 +1,11 @@
1	- ~~Placeholder~~ ~~for~~ ~~README.md.~~ ~~Replace~~ ~~with~~ ~~actual code from spec.~~

+# Dialogue→Speaker Dataset Builder (HF Spaces)
+A GUI app (Gradio) that prepares text passages, calls the OpenAI API to structure dialogue into `Speaker N:` lines, lets you review & edit, and exports JSONL or a HF Dataset.
+## Quickstart (HF Spaces)
+1. Create a new Space → SDK: **Gradio**.
+2. Add **Secrets**:
+   - `OPENAI_API_KEY` (required)
+   - `OPENAI_MODEL` (optional, default `gpt-4o-mini`)
+   - `HF_TOKEN` (optional, for push_to_hub)
+3. Upload all these files.
+4. Launch the Space.

app.py CHANGED Viewed

	@@ -1 +1,135 @@
1	- ~~Placeholder~~ ~~for app.py. Replace with actual code from spec.~~

+import os
+import gradio as gr
+from typing import List, Dict, Any
+from data_io import load_from_hub_or_upload
+from teacher import call_teacher, MODEL, INSTRUCTION
+from validators import validate_output
+from exporters import to_jsonl, to_hf_dataset
+# Working state that the event handlers below read and mutate:
+#   "passages"   - text chunks produced by the "Prepare passages" step
+#   "records"    - one generated record dict per passage
+#   "dataset_id" - label of the source the passages were loaded from
+# NOTE(review): this is a module-level dict, so it is presumably shared by every
+# visitor of the Space - confirm whether per-user state (gr.State) is wanted.
+SESSION: Dict[str, Any] = {
+    "passages": [],
+    "records": [],
+    "dataset_id": None,
+}
+# Markdown blurb rendered directly under the page title.
+DESCRIPTION = """### Dialogue→Speaker Dataset Builder
+A Gradio app that prepares passages, generates `Speaker N:`-structured dialogue via the OpenAI API, lets you review & edit, and exports JSONL / HF Datasets."""
+# Build the five-tab UI. The component handles created here are connected to
+# their handlers by the .click() calls at the end of this `with` block.
+with gr.Blocks(title="Dialogue→Speaker Dataset Builder") as demo:
+    gr.Markdown("# Dialogue→Speaker Dataset Builder")
+    gr.Markdown(DESCRIPTION)
+    # Data tab: choose the text source plus the sampling/chunking parameters.
+    with gr.Tab("Data"):
+        src_mode = gr.Radio(["HF Dataset", "Upload .txt"], value="HF Dataset", label="Source")
+        hf_id = gr.Textbox(value="Navanjana/Gutenberg_books", label="HF dataset id (train split)")
+        upload = gr.File(file_types=[".txt"], label="Upload a .txt file")
+        sample = gr.Number(value=200, label="Sample passages (0 = all)")
+        min_words = gr.Number(value=80, label="Min words per passage")
+        chunk = gr.Number(value=1200, label="Chunk size (chars)")
+        btn_prep = gr.Button("Prepare passages")
+        info_data = gr.Markdown()
+    # Generation tab: model settings and a per-record summary table
+    # (record index, status, passage length in characters).
+    with gr.Tab("Generation"):
+        model_box = gr.Textbox(value=os.getenv("OPENAI_MODEL", MODEL), label="OpenAI model")
+        temperature = gr.Slider(0, 1, value=0.0, step=0.1, label="Temperature")
+        btn_gen = gr.Button("Generate with OpenAI")
+        progress_gen = gr.Markdown()
+        rec_table = gr.Dataframe(headers=["#", "status", "chars"], row_count=(0, "dynamic"))
+    # Review tab: load one record by index, edit its output and status, save it back.
+    with gr.Tab("Review"):
+        idx = gr.Number(value=0, label="Record #")
+        inp = gr.Textbox(lines=12, label="Input passage", interactive=False)
+        out = gr.Textbox(lines=12, label="Output (edit)")
+        status = gr.Dropdown(["accepted","needs_work","unreviewed"], value="unreviewed", label="Status")
+        btn_load = gr.Button("Load record")
+        btn_save = gr.Button("Save changes")
+        review_msg = gr.Markdown()
+    # Export tab: write a local JSONL file or push the records to the Hugging Face Hub.
+    with gr.Tab("Export"):
+        btn_jsonl = gr.Button("Download JSONL")
+        dl_path = gr.Textbox(label="JSONL path")
+        push_repo = gr.Textbox(value="", label="HF Dataset repo (e.g. yourname/gutenberg_dialogue_v1)")
+        private_toggle = gr.Checkbox(value=True, label="Private repo")
+        btn_push = gr.Button("Push to Hugging Face Hub")
+        export_msg = gr.Markdown()
+    # Settings tab: read-only view of the instruction text stored with every record.
+    with gr.Tab("Settings"):
+        instr = gr.Textbox(value=INSTRUCTION, lines=14, label="Canonical instruction (read-only)", interactive=False)
+        gr.Markdown("Set `OPENAI_API_KEY` & optional `OPENAI_MODEL` in Space Secrets.")
+    def on_prepare(src_mode, hf_id, upload, sample, min_words, chunk):
+        """Load and chunk the chosen source, then reset the session for a fresh run.
+        Stores the passages and source label in SESSION, clears any previously
+        generated records, and returns a one-line Markdown summary.
+        """
+        # An emptied number field can reach the handler as None, and int(None) raises.
+        sample_n = max(int(sample or 0), 0)
+        min_words_n = max(int(min_words or 0), 0)
+        chunk_n = int(chunk or 0)
+        if chunk_n <= 0:
+            # Fall back to the UI default instead of chunking at zero width.
+            chunk_n = 1200
+        passages, dataset_id = load_from_hub_or_upload(src_mode, hf_id, upload, sample_n, min_words_n, chunk_n)
+        SESSION["passages"] = passages
+        SESSION["dataset_id"] = dataset_id
+        # Old records belong to the previous passage list, so drop them.
+        SESSION["records"] = []
+        return f"Prepared {len(passages)} passages from: {dataset_id}"
+    def on_generate(model_name, temperature):
+        """Run the teacher model over every prepared passage and build the records.
+        Returns a summary message and the rows for the results table
+        ([index, status, passage length]). Outputs that are empty or fail
+        validate_output are kept but marked "needs_work".
+        """
+        if not SESSION["passages"]:
+            return "No passages prepared yet.", []
+        # A blank model box must not overwrite the configured model with "".
+        chosen_model = (model_name or "").strip() or MODEL
+        # The model choice is handed to call_teacher through the environment.
+        os.environ["OPENAI_MODEL"] = chosen_model
+        temp = float(temperature or 0.0)
+        rows, records = [], []
+        valid_count = 0
+        for i, passage in enumerate(SESSION["passages"]):
+            generated = call_teacher(passage, temperature=temp)
+            if generated and validate_output(generated):
+                rec_status = "unreviewed"
+                valid_count += 1
+            else:
+                rec_status = "needs_work"
+                # call_teacher may return None; store an empty string instead.
+                generated = generated or ""
+            records.append({
+                "task": "dialogue_format",
+                "instruction": INSTRUCTION,
+                "input": passage,
+                "output": generated,
+                "meta": {
+                    "chars": len(passage),
+                    "model": chosen_model,
+                    "status": rec_status,
+                    "source": "LLM",
+                    "dataset_id": SESSION["dataset_id"],
+                },
+            })
+            rows.append([i, rec_status, len(passage)])
+        SESSION["records"] = records
+        bad_count = len(records) - valid_count
+        return f"Generated {valid_count} valid, {bad_count} need work.", rows
+    def on_load(idx):
+        """Return (input passage, output text, status) for record number *idx*.
+        Raises gr.Error, which Gradio shows to the user, when there are no
+        records or *idx* is missing or out of range.
+        """
+        records = SESSION["records"]
+        if not records:
+            raise gr.Error("No records yet. Run generation first.")
+        # Reject negatives too: a plain list lookup would silently wrap around.
+        if idx is None or not 0 <= int(idx) < len(records):
+            raise gr.Error(f"Record #{idx} does not exist. Valid range: 0 to {len(records) - 1}.")
+        r = records[int(idx)]
+        return r["input"], r["output"], r["meta"]["status"]
+    def on_save(idx, output, status):
+        """Store the edited output and review status on record *idx*; return a status message.
+        An invalid or missing index leaves the records untouched and is reported in the message.
+        """
+        records = SESSION["records"]
+        # Guard the lookup: an unchecked index raises IndexError or wraps on negatives.
+        if idx is None or not 0 <= int(idx) < len(records):
+            return f"Cannot save: record #{idx} does not exist ({len(records)} records loaded)."
+        i = int(idx)
+        records[i]["output"] = output
+        records[i]["meta"]["status"] = status
+        return f"Saved record #{i} as {status}."
+    def on_export_jsonl():
+        """Write the session's records to a fixed JSONL file and return that file's path."""
+        target = "workspace/dataset.jsonl"
+        to_jsonl(SESSION["records"], target)
+        return target
+    def on_push(push_repo, private_toggle):
+        """Save the session's records as a dataset on disk and push them to *push_repo*.
+        Returns a status message. Nothing is pushed when the repo name is blank
+        or there are no records. The Hub token is read from HF_TOKEN.
+        """
+        repo = (push_repo or "").strip()
+        if not repo:
+            return "Provide a repo name like 'yourname/gutenberg_dialogue_v1'"
+        # Do not push an empty dataset when generation has not been run.
+        if not SESSION["records"]:
+            return "Nothing to push: generate records first."
+        ds = to_hf_dataset(
+            SESSION["records"],
+            save_to="workspace/hf_dataset",
+            push_repo=repo,
+            private=bool(private_toggle),
+            token=os.getenv("HF_TOKEN"),
+        )
+        return f"Pushed {len(ds)} records to {repo}"
+    # Connect each button to its handler: (callback, input components, output components).
+    btn_prep.click(on_prepare, [src_mode, hf_id, upload, sample, min_words, chunk], [info_data])
+    btn_gen.click(on_generate, [model_box, temperature], [progress_gen, rec_table])
+    btn_load.click(on_load, [idx], [inp, out, status])
+    btn_save.click(on_save, [idx, out, status], [review_msg])
+    # The export handler takes no inputs; it only reports the path it wrote.
+    btn_jsonl.click(on_export_jsonl, [], [dl_path])
+    btn_push.click(on_push, [push_repo, private_toggle], [export_msg])
+# Launch the app only when this file is executed as a script.
+if __name__ == "__main__":
+    demo.launch()

data_io.py CHANGED Viewed

	@@ -1 +1,57 @@
1	- ~~Placeholder~~ ~~for~~ ~~data_io.py.~~ ~~Replace with actual code from spec.~~

+from datasets import load_dataset
+from ftfy import fix_text
+import regex as re
+from typing import List, Tuple
+# Default value of split_passages' max_chars parameter (passage size in characters).
+DEF_CHUNK = 1200
+# Typographic double quotes and guillemets map to '"'; typographic single quotes map to "'".
+_QUOTE_TABLE = str.maketrans({
+    "“": "\"",
+    "”": "\"",
+    "‘": "'",
+    "’": "'",
+    "«": "\"",
+    "»": "\"",
+})
+def ascii_quotes(s: str) -> str:
+    """Return *s* with curly quotes and guillemets replaced by plain ASCII quotes."""
+    return s.translate(_QUOTE_TABLE)
+def split_passages(text: str, max_chars: int = DEF_CHUNK) -> List[str]:
+    """Group blank-line-separated paragraphs of *text* into passages of about *max_chars*.
+    Paragraphs are stripped, empty ones dropped, and consecutive paragraphs are
+    joined with a blank line while the joined length stays within *max_chars*.
+    A single paragraph longer than *max_chars* is emitted whole, not cut.
+    """
+    chunks: List[str] = []
+    current = ""
+    for block in re.split(r"\n{2,}", text):
+        para = block.strip()
+        if not para:
+            continue
+        if not current:
+            # The first paragraph of a passage is always taken, whatever its size.
+            current = para
+        elif len(current) + len(para) + 2 <= max_chars:
+            # The +2 accounts for the "\n\n" separator.
+            current = current + "\n\n" + para
+        else:
+            chunks.append(current)
+            current = para
+    if current:
+        chunks.append(current)
+    return chunks
+def _clean_text(raw: str) -> str:
+    """Run ftfy's fix_text on *raw*, convert typographic quotes to ASCII and trim whitespace."""
+    return ascii_quotes(fix_text(raw)).strip()
+def _collect_passages(text: str, chunk: int, min_words: int, sample: int, passages: List[str]) -> bool:
+    """Append chunks of *text* that have at least *min_words* words to *passages*.
+    Returns True as soon as *passages* holds *sample* items; never True when *sample* is 0.
+    """
+    for p in split_passages(text, max_chars=chunk):
+        if len(p.split()) < min_words:
+            continue
+        passages.append(p)
+        if sample and len(passages) >= sample:
+            return True
+    return False
+def _read_upload(upload_file) -> str:
+    """Return the uploaded file's text, whether it arrives as a path or as an open file object."""
+    if hasattr(upload_file, "read"):
+        data = upload_file.read()
+    else:
+        # Gradio 4's gr.File (default type="filepath") passes a path string rather
+        # than a file object, so calling .read() on it would raise AttributeError.
+        path = getattr(upload_file, "name", upload_file)
+        with open(path, "rb") as fh:
+            data = fh.read()
+    if isinstance(data, bytes):
+        return data.decode("utf-8", errors="ignore")
+    return data
+def load_from_hub_or_upload(src_mode: str, dataset_id: str, upload_file, sample: int, min_words: int, chunk: int) -> Tuple[List[str], str]:
+    """Build the passage list from a Hub dataset or from an uploaded text file.
+    With src_mode == "HF Dataset" the train split of *dataset_id* is read and each
+    example's "text" field is used; any other mode reads *upload_file*.
+    Text is cleaned, chunked with split_passages(max_chars=chunk), and chunks with
+    fewer than *min_words* words are dropped. Collection stops once *sample*
+    passages are gathered (0 means no limit).
+    Returns (passages, source label); ([], "(no upload)") when no file was given.
+    """
+    passages: List[str] = []
+    if src_mode == "HF Dataset":
+        limits = (int(chunk), int(min_words), int(sample))
+        ds = load_dataset(dataset_id, split="train")
+        for ex in ds:
+            raw = ex.get("text", "") or ""
+            if not raw.strip():
+                continue
+            if _collect_passages(_clean_text(raw), *limits, passages):
+                break
+        return passages, dataset_id
+    if upload_file is None:
+        return [], "(no upload)"
+    limits = (int(chunk), int(min_words), int(sample))
+    _collect_passages(_clean_text(_read_upload(upload_file)), *limits, passages)
+    label = getattr(upload_file, "name", None)
+    if not label:
+        # A plain path string has no .name; use the path itself as the label.
+        label = upload_file if isinstance(upload_file, str) else "upload.txt"
+    return passages, label

exporters.py CHANGED Viewed

	@@ -1 +1,19 @@
1	- ~~Placeholder~~ ~~for~~ ~~exporters.py. Replace with actual code from spec.~~

+import os, json
+from typing import List, Dict, Any, Optional
+from datasets import Dataset
+def to_jsonl(records: List[Dict[str, Any]], path: str) -> None:
+    """Write *records* to *path* as UTF-8 JSON Lines, one JSON object per line.
+    The parent directory is created when *path* has one, and an existing file is
+    overwritten. Non-ASCII characters are written as-is rather than escaped.
+    """
+    parent = os.path.dirname(path)
+    # dirname is "" for a bare filename, and os.makedirs("") raises FileNotFoundError.
+    if parent:
+        os.makedirs(parent, exist_ok=True)
+    with open(path, "w", encoding="utf-8") as f:
+        f.writelines(json.dumps(r, ensure_ascii=False) + "\n" for r in records)
+def to_hf_dataset(records: List[Dict[str, Any]], save_to: Optional[str] = None,
+                  push_repo: Optional[str] = None, private: bool = True, token: Optional[str] = None):
+    """Build a Dataset from *records*, optionally saving it to disk and/or pushing it to the Hub.
+    *save_to* is a directory (created if missing) for save_to_disk; *push_repo* is the
+    Hub repo id passed to push_to_hub together with *private* and *token*.
+    Returns the Dataset in all cases.
+    """
+    dataset = Dataset.from_list(records)
+    if save_to:
+        os.makedirs(save_to, exist_ok=True)
+        dataset.save_to_disk(save_to)
+    if push_repo:
+        dataset.push_to_hub(push_repo, private=private, token=token)
+    return dataset

requirements.txt CHANGED Viewed

	@@ -1 +1,8 @@
1	- ~~Placeholder for requirements~~.~~txt~~. ~~Replace with actual code from spec.~~

+gradio>=4.44.0
+datasets>=3.0.0
+ftfy
+regex
+openai>=1.40.0
+pydantic
+pandas
+orjson

teacher.py CHANGED Viewed

	@@ -1 +1,41 @@
1	- ~~Placeholder~~ ~~for~~ ~~teacher.py. Replace with actual code from spec.~~

+import os, time
+from typing import Optional
+from openai import OpenAI
+# Instruction text placed at the start of every prompt built by call_teacher.
+# NOTE(review): the text maps both narration and the first detected speaker to
+# "Speaker 1" - confirm that overlap is intended.
+INSTRUCTION = """You are a dialogue structuring assistant for multi-speaker TTS.
+Map characters to speakers dynamically within each passage (first distinct speaker you detect -> Speaker 1, second -> Speaker 2, etc.).
+Requirements:
+- Detect speaker changes from context (“said/replied/asked/…”).
+- Output lines strictly as:
+  Speaker 1: …
+  Speaker 2: …
+  (and so on)
+- Label narration (non-dialogue) as Speaker 1.
+- Remove dialogue attribution tags (e.g., “he said”), EXCEPT when the narrator speaks in first person; keep those inline (e.g., “I said”).
+- Preserve original order and content; no omissions or rewrites.
+- Return only the formatted lines, no extra commentary.
+"""
+# Model name captured at import time; call_teacher re-reads OPENAI_MODEL on each call
+# and uses this value only as the fallback.
+MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
+# NOTE(review): the client is constructed at import time, so presumably
+# OPENAI_API_KEY must already be set when this module is imported - confirm.
+client = OpenAI()
+# Extra formatting reminder that call_teacher appends to the prompt when retrying.
+STRICT_SUFFIX = "\n\nIMPORTANT: Every line must start with 'Speaker N: ' and include at least two lines."
+def call_teacher(passage: str, temperature: float = 0.0, max_retries: int = 2) -> Optional[str]:
+    """Ask the model to restructure *passage* into ``Speaker N:`` lines.
+    Makes up to ``max_retries + 1`` attempts. The model name is read from the
+    OPENAI_MODEL environment variable on every call, falling back to MODEL.
+    Returns the first non-blank response text, or None when every attempt
+    fails or comes back empty. API errors are logged, not raised.
+    """
+    import logging
+    log = logging.getLogger(__name__)
+    model = os.getenv("OPENAI_MODEL", MODEL)
+    base_prompt = f"{INSTRUCTION}\n\nText:\n{passage}"
+    prompt = base_prompt
+    attempts = max_retries + 1
+    for attempt in range(attempts):
+        try:
+            resp = client.responses.create(
+                model=model,
+                input=prompt,
+                temperature=temperature,
+            )
+            out = resp.output_text
+            if out and out.strip():
+                return out
+            # Empty answer: retry with the stricter wording, added once so the
+            # prompt does not grow with duplicate suffixes on every attempt.
+            prompt = base_prompt + STRICT_SUFFIX
+        except Exception as exc:
+            # Best-effort call: keep going, but leave a trace of what went wrong.
+            log.warning("call_teacher attempt %d/%d failed: %s", attempt + 1, attempts, exc)
+            # Back off only when another attempt will actually follow.
+            if attempt + 1 < attempts:
+                time.sleep(0.5 * (attempt + 1))
+    return None

validators.py CHANGED Viewed

	@@ -1 +1,19 @@
1	- ~~Placeholder~~ ~~for~~ ~~validators.py.~~ ~~Replace with actual code from spec.~~

+import regex as re
+# Group 1 is the whole "Speaker N" label; group 2 is just the number N.
+SPEAKER_LINE = re.compile(r"^(Speaker\s+(\d+)):\s")
+def validate_output(text: str, min_lines: int = 2, max_speaker_index: int = 9) -> bool:
+    """Return True when every non-blank line of *text* is a well-formed ``Speaker N: ...`` line.
+    Blank lines are ignored. The text is rejected when it is empty, has fewer
+    than *min_lines* non-blank lines, contains a line that does not start with
+    ``Speaker N:`` followed by whitespace, or uses a speaker number outside
+    1..*max_speaker_index*.
+    """
+    if not text:
+        return False
+    lines = [ln for ln in text.splitlines() if ln.strip()]
+    if len(lines) < min_lines:
+        return False
+    for ln in lines:
+        m = SPEAKER_LINE.match(ln)
+        if m is None:
+            return False
+        # Take the number from the regex capture rather than re-splitting the line.
+        # Speakers are numbered from 1, so "Speaker 0" is rejected like an index above the cap.
+        if not 1 <= int(m.group(2)) <= max_speaker_index:
+            return False
+    return True