Mfischthal committed
Commit d70667c · verified · 1 Parent(s): 758613a

Upload 2 files

Files changed (2):
  1. app.py +9 -13
  2. data_io.py +45 -16
app.py CHANGED
@@ -1,14 +1,13 @@
+
 import os
 import gradio as gr
 from typing import List, Dict, Any, Tuple
 
-# Local imports
 from data_io import load_from_hub_or_upload
 from teacher import call_teacher, MODEL, INSTRUCTION
 from validators import validate_output
 from exporters import to_jsonl, to_hf_dataset
 
-# ---------------- State ----------------
 SESSION: Dict[str, Any] = {
     "passages": [],
     "records": [],
@@ -21,12 +20,12 @@ DESCRIPTION = (
     "review & edit, and export JSONL / HF Datasets."
 )
 
-# ---------------- Callbacks ----------------
-def on_prepare(src_mode: str, hf_id: str, upload, sample: float, min_words: float, chunk: float) -> str:
+def on_prepare(src_mode: str, hf_id: str, upload, sample: float, min_words: float, chunk: float, quote_pairs: float) -> str:
     sample_i = int(sample) if sample else 0
     min_words_i = int(min_words) if min_words else 80
     chunk_i = int(chunk) if chunk else 1200
-    passages, dataset_id = load_from_hub_or_upload(src_mode, hf_id, upload, sample_i, min_words_i, chunk_i)
+    qpairs_i = int(quote_pairs) if quote_pairs else 0
+    passages, dataset_id = load_from_hub_or_upload(src_mode, hf_id, upload, sample_i, min_words_i, chunk_i, quote_pairs=qpairs_i)
     SESSION["passages"] = passages
     SESSION["dataset_id"] = dataset_id
     SESSION["records"] = []
@@ -36,9 +35,7 @@ def on_generate(model_name: str, temperature: float) -> Tuple[str, list]:
     if not SESSION["passages"]:
         return "No passages prepared yet.", []
     os.environ["OPENAI_MODEL"] = model_name
-    rows = []
-    records = []
-    ok = bad = 0
+    rows, records, ok, bad = [], [], 0, 0
     for i, p in enumerate(SESSION["passages"]):
         y = call_teacher(p, temperature=float(temperature))
         status = "unreviewed"
@@ -94,7 +91,6 @@ def on_push(push_repo: str, private_toggle: bool) -> str:
     )
     return f"Pushed {len(ds)} records to {push_repo}"
 
-# ---------------- UI ----------------
 def build_ui():
     with gr.Blocks(title="Dialogue→Speaker Dataset Builder", theme=gr.themes.Default()) as demo:
         gr.Markdown("# Dialogue→Speaker Dataset Builder")
@@ -104,9 +100,10 @@ def build_ui():
             src_mode = gr.Radio(["HF Dataset", "Upload .txt"], value="HF Dataset", label="Source")
             hf_id = gr.Textbox(value="Navanjana/Gutenberg_books", label="HF dataset id (train split)")
             upload = gr.File(file_types=[".txt"], label="Upload a .txt file")
-            sample = gr.Number(value=200, label="Sample passages (0 = all)")
+            sample = gr.Number(value=5, label="Sample passages (0 = all)")
             min_words = gr.Number(value=80, label="Min words per passage")
             chunk = gr.Number(value=1200, label="Chunk size (chars)")
+            quote_pairs = gr.Number(value=1, label="Min dialogue quote-pairs (0 = no filter)")
             btn_prep = gr.Button("Prepare passages")
             info_data = gr.Markdown()
 
@@ -138,8 +135,7 @@ def build_ui():
         instr = gr.Textbox(value=INSTRUCTION, lines=14, label="Canonical instruction (read-only)", interactive=False)
         gr.Markdown("Set `OPENAI_API_KEY` & optional `OPENAI_MODEL` in Space Secrets.")
 
-        # Wire callbacks
-        btn_prep.click(on_prepare, [src_mode, hf_id, upload, sample, min_words, chunk], [info_data])
+        btn_prep.click(on_prepare, [src_mode, hf_id, upload, sample, min_words, chunk, quote_pairs], [info_data])
         btn_gen.click(on_generate, [model_box, temperature], [progress_gen, rec_table])
         btn_load.click(on_load, [idx], [inp, out, status])
         btn_save.click(on_save, [idx, out, status], [review_msg])
@@ -151,4 +147,4 @@ def build_ui():
 demo = build_ui()
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
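
Note on the app.py wiring: the inputs list passed to Gradio's `.click` maps positionally onto the callback's parameters, which is why `quote_pairs` was appended to both the component list and the `on_prepare` signature. A minimal, self-contained sketch of that contract (stub callback standing in for the app's `on_prepare`; the full layout is omitted):

```python
import gradio as gr

# Stub with the same parameter order as the diff's on_prepare.
def on_prepare(src_mode, hf_id, upload, sample, min_words, chunk, quote_pairs):
    return f"mode={src_mode}, sample={int(sample)}, quote_pairs={int(quote_pairs)}"

with gr.Blocks() as demo:
    src_mode = gr.Radio(["HF Dataset", "Upload .txt"], value="HF Dataset")
    hf_id = gr.Textbox(value="Navanjana/Gutenberg_books")
    upload = gr.File(file_types=[".txt"])
    sample = gr.Number(value=5)
    min_words = gr.Number(value=80)
    chunk = gr.Number(value=1200)
    quote_pairs = gr.Number(value=1)
    btn_prep = gr.Button("Prepare passages")
    info_data = gr.Markdown()
    # The inputs list lines up positionally with on_prepare's parameters.
    btn_prep.click(on_prepare,
                   [src_mode, hf_id, upload, sample, min_words, chunk, quote_pairs],
                   [info_data])
```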
data_io.py CHANGED
@@ -1,10 +1,13 @@
+
 from datasets import load_dataset
 from ftfy import fix_text
 import regex as re
-from typing import List, Tuple
+from typing import List, Tuple, Iterable, Optional
 
 DEF_CHUNK = 1200
 
+CANDIDATE_TEXT_FIELDS = ["text", "content", "body", "article", "raw"]
+
 def ascii_quotes(s: str) -> str:
     return (s.replace("“","\"").replace("”","\"")
             .replace("‘","'").replace("’","'")
@@ -22,23 +25,47 @@ def split_passages(text: str, max_chars: int = DEF_CHUNK) -> List[str]:
     if buf: out.append(buf)
     return out
 
-def load_from_hub_or_upload(src_mode: str, dataset_id: str, upload_file, sample: int, min_words: int, chunk: int) -> Tuple[List[str], str]:
+def pick_text(example: dict) -> Optional[str]:
+    for key in CANDIDATE_TEXT_FIELDS:
+        val = example.get(key, None)
+        if isinstance(val, str) and val.strip():
+            return val
+    # fallback: find the longest string value
+    strings = [str(v) for v in example.values() if isinstance(v, str)]
+    if strings:
+        return max(strings, key=len)
+    return None
+
+def has_enough_quotes(passage: str, min_pairs: int = 1) -> bool:
+    # Count double quotes after normalization
+    q = passage.count('"')
+    return (q // 2) >= min_pairs
+
+def iter_passages_streaming(dataset_id: str, split: str = "train", min_words: int = 80, chunk: int = DEF_CHUNK, quote_pairs: int = 0):
+    """Stream records without downloading full dataset; yields normalized, chunked passages."""
+    ds = load_dataset(dataset_id, split=split, streaming=True)
+    for ex in ds:
+        raw = pick_text(ex) or ""
+        if not raw.strip():
+            continue
+        tx = ascii_quotes(fix_text(raw)).strip()
+        for p in split_passages(tx, max_chars=int(chunk)):
+            if len(p.split()) < int(min_words):
+                continue
+            if quote_pairs and not has_enough_quotes(p, min_pairs=quote_pairs):
+                continue
+            yield p
+
+def load_from_hub_or_upload(src_mode: str, dataset_id: str, upload_file, sample: int, min_words: int, chunk: int, quote_pairs: int = 0) -> Tuple[List[str], str]:
+    """Return up to `sample` passages; uses streaming for HF datasets to avoid full downloads."""
     passages: List[str] = []
     actual_id = None
+    cap = int(sample) if sample else 0
+
     if src_mode == "HF Dataset":
-        ds = load_dataset(dataset_id, split="train")
-        for ex in ds:
-            raw = ex.get("text", "") or ""
-            if not raw.strip():
-                continue
-            tx = ascii_quotes(fix_text(raw)).strip()
-            for p in split_passages(tx, max_chars=int(chunk)):
-                if len(p.split()) < int(min_words):
-                    continue
-                passages.append(p)
-                if sample and len(passages) >= int(sample):
-                    break
-            if sample and len(passages) >= int(sample):
+        for p in iter_passages_streaming(dataset_id, split="train", min_words=min_words, chunk=chunk, quote_pairs=quote_pairs):
+            passages.append(p)
+            if cap and len(passages) >= cap:
                 break
         actual_id = dataset_id
     else:
@@ -49,8 +76,10 @@ def load_from_hub_or_upload(src_mode: str, dataset_id: str, upload_file, sample:
         for p in split_passages(tx, max_chars=int(chunk)):
             if len(p.split()) < int(min_words):
                 continue
+            if quote_pairs and not has_enough_quotes(p, min_pairs=quote_pairs):
+                continue
             passages.append(p)
-            if sample and len(passages) >= int(sample):
+            if cap and len(passages) >= cap:
                 break
         actual_id = getattr(upload_file, 'name', 'upload.txt')
 
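
A quick local sanity check of the new streaming path and the quote-pair filter (a sketch assuming `datasets`, `ftfy`, and `regex` are installed; the dataset id is the app's default, and stopping after three passages is an arbitrary cutoff):

```python
from data_io import iter_passages_streaming, has_enough_quotes

# The heuristic counts normalized double quotes and divides by two.
assert has_enough_quotes('He said, "Hello." She said, "Hi."', min_pairs=2)
assert not has_enough_quotes("No dialogue here.", min_pairs=1)

# Streaming iterates shards over HTTP, so a sample-capped run touches
# only as much of the corpus as it needs.
gen = iter_passages_streaming(
    "Navanjana/Gutenberg_books",
    split="train",
    min_words=80,
    chunk=1200,
    quote_pairs=1,
)
for _, passage in zip(range(3), gen):
    print(passage[:100], "...")
```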