File size: 6,216 Bytes
d70667c
f432fa9
 
758613a
 
f432fa9
 
 
 
 
 
 
 
 
 
 
758613a
 
 
 
 
 
d70667c
758613a
 
 
d70667c
 
758613a
 
 
 
 
 
 
 
 
d70667c
758613a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f432fa9
758613a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d70667c
758613a
 
d70667c
758613a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d70667c
758613a
 
 
 
 
 
 
 
 
f432fa9
 
d70667c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151

import os
import gradio as gr
from typing import List, Dict, Any, Tuple

from data_io import load_from_hub_or_upload
from teacher import call_teacher, MODEL, INSTRUCTION
from validators import validate_output
from exporters import to_jsonl, to_hf_dataset

# Mutable module-level working state read and written by the on_* callbacks.
# NOTE(review): being a module global, this is presumably shared by every
# browser session talking to the same process — confirm that is intended.
SESSION: Dict[str, Any] = dict(
    passages=[],      # filled by on_prepare
    records=[],       # filled by on_generate, edited by on_save
    dataset_id=None,  # filled by on_prepare
)

# Markdown blurb shown directly under the page title.
DESCRIPTION = "\n".join(
    (
        "### Dialogue→Speaker Dataset Builder",
        "Prepare passages, generate `Speaker N:` dialogue via the OpenAI API, "
        "review & edit, and export JSONL / HF Datasets.",
    )
)

def on_prepare(src_mode: str, hf_id: str, upload, sample: float, min_words: float, chunk: float, quote_pairs: float) -> str:
    """Load passages from the selected source and reset the session.

    The numeric arguments are wired to ``gr.Number`` inputs, so they arrive
    as floats (or empty) and are coerced to ``int`` before being forwarded
    to ``load_from_hub_or_upload``.

    Args:
        src_mode: Selected option of the "Source" radio.
        hf_id: Hugging Face dataset id typed into the UI.
        upload: Value of the file-upload component; forwarded untouched.
        sample: Number of passages to sample; empty or 0 becomes 0.
        min_words: Minimum words per passage; only an empty field falls
            back to 80, so an explicit 0 is honoured.
        chunk: Chunk size in characters; empty or 0 falls back to 1200.
        quote_pairs: Minimum dialogue quote-pairs; empty or 0 becomes 0.

    Returns:
        A status line with the number of prepared passages and the dataset id.
    """
    sample_i = int(sample) if sample else 0
    # Test for None rather than truthiness: with a truthiness check a
    # user-entered 0 ("no minimum") was silently replaced by the default 80.
    min_words_i = 80 if min_words is None else int(min_words)
    # A chunk size of 0 is not meaningful, so 0 keeps falling back to 1200.
    chunk_i = int(chunk) if chunk else 1200
    qpairs_i = int(quote_pairs) if quote_pairs else 0

    passages, dataset_id = load_from_hub_or_upload(
        src_mode, hf_id, upload, sample_i, min_words_i, chunk_i, quote_pairs=qpairs_i
    )

    # A new passage set invalidates any records generated from the old one.
    SESSION["passages"] = passages
    SESSION["dataset_id"] = dataset_id
    SESSION["records"] = []
    return f"Prepared {len(passages)} passages from: {dataset_id}"

def on_generate(model_name: str, temperature: float) -> Tuple[str, list]:
    """Run the teacher model over every prepared passage.

    Each passage is sent to ``call_teacher``; the reply is checked with
    ``validate_output``. Valid replies are stored with status
    ``"unreviewed"``, empty or invalid ones with ``"needs_work"`` (a missing
    reply is stored as an empty string). The resulting records replace
    ``SESSION["records"]``.

    Args:
        model_name: Model identifier; written to the ``OPENAI_MODEL``
            environment variable before any call is made.
        temperature: Sampling temperature, converted to ``float`` per call.

    Returns:
        A summary message and one ``[index, status, chars]`` row per passage.
        When no passages are prepared, a notice and an empty row list.
    """
    passages = SESSION["passages"]
    if not passages:
        return "No passages prepared yet.", []

    # NOTE(review): presumably call_teacher reads OPENAI_MODEL from the
    # environment on each call — confirm against teacher.py.
    os.environ["OPENAI_MODEL"] = model_name

    table_rows = []
    generated = []
    valid_count = 0
    invalid_count = 0

    for position, passage in enumerate(passages):
        answer = call_teacher(passage, temperature=float(temperature))

        if answer and validate_output(answer):
            valid_count += 1
            review_state = "unreviewed"
        else:
            invalid_count += 1
            answer = answer or ""
            review_state = "needs_work"

        meta = {
            "chars": len(passage),
            "model": os.getenv("OPENAI_MODEL", model_name),
            "status": review_state,
            "source": "LLM",
            "dataset_id": SESSION["dataset_id"],
        }
        generated.append(
            {
                "task": "dialogue_format",
                "instruction": INSTRUCTION,
                "input": passage,
                "output": answer,
                "meta": meta,
            }
        )
        table_rows.append([position, review_state, len(passage)])

    SESSION["records"] = generated
    return f"Generated {valid_count} valid, {invalid_count} need work.", table_rows

def on_load(idx: float) -> Tuple[str, str, str]:
    """Fetch one generated record for display in the Review tab.

    Args:
        idx: Zero-based record number from the ``gr.Number`` input; it is
            truncated to ``int``.

    Returns:
        ``(input, output, status)`` of the selected record.

    Raises:
        gr.Error: If there are no records, ``idx`` is empty, or ``idx`` is
            outside ``0 .. len(records) - 1``.
    """
    records = SESSION["records"]
    if not records:
        raise gr.Error("No records generated yet.")
    if idx is None:
        raise gr.Error("Enter a record number.")

    i = int(idx)
    # Reject negatives explicitly: plain list indexing would wrap around and
    # silently load a record counted from the end of the list.
    if not 0 <= i < len(records):
        raise gr.Error(f"Record #{i} is out of range (valid: 0-{len(records) - 1}).")

    r = records[i]
    return r["input"], r["output"], r["meta"]["status"]

def on_save(idx: float, output: str, status: str) -> str:
    """Store the reviewer's edited output and status on one record.

    Args:
        idx: Zero-based record number from the ``gr.Number`` input; it is
            truncated to ``int``.
        output: Edited output text to store on the record.
        status: Review status to store in the record's ``meta``.

    Returns:
        A confirmation message, or an explanatory message (with nothing
        modified) when there are no records or ``idx`` is empty or out of
        range.
    """
    records = SESSION["records"]
    if not records:
        return "No records generated yet."
    if idx is None:
        return "Enter a record number."

    i = int(idx)
    # Reject negatives explicitly: plain list indexing would wrap around and
    # overwrite a different record than the one shown to the reviewer.
    if not 0 <= i < len(records):
        return f"Record #{i} is out of range (valid: 0-{len(records) - 1})."

    record = records[i]
    record["output"] = output
    record["meta"]["status"] = status
    return f"Saved record #{i} as {status}."

def on_export_jsonl() -> str:
    """Write the session's records as JSONL and return the file path.

    Returns:
        The fixed relative path handed to ``to_jsonl``.
    """
    # NOTE(review): presumably to_jsonl creates "workspace/" when it is
    # missing — confirm in exporters.py.
    target = "workspace/dataset.jsonl"
    current_records = SESSION["records"]
    to_jsonl(current_records, target)
    return target

def on_push(push_repo: str, private_toggle: bool) -> str:
    """Build a dataset from the session's records and push it to the Hub.

    Args:
        push_repo: Target repo name such as ``"yourname/dataset"``.
            Surrounding whitespace is ignored.
        private_toggle: Whether the repo should be private.

    Returns:
        A confirmation message on success, or a hint when the repo name is
        missing/blank or there are no records to push.
    """
    # Strip first so a blank or padded name (easy to paste by accident) is
    # caught here instead of being sent on as a repo id.
    repo = (push_repo or "").strip()
    if not repo:
        return "Provide a repo name like 'yourname/gutenberg_dialogue_v1'"

    records = SESSION["records"]
    if not records:
        return "No records to push. Generate records first."

    ds = to_hf_dataset(
        records,
        save_to="workspace/hf_dataset",
        push_repo=repo,
        private=bool(private_toggle),
        token=os.getenv("HF_TOKEN"),
    )
    return f"Pushed {len(ds)} records to {repo}"

def build_ui():
    """Assemble the Gradio Blocks app and wire its buttons.

    Five tabs are created in order (Data, Generation, Review, Export,
    Settings); each button is then bound to its ``on_*`` callback.

    Returns:
        The constructed ``gr.Blocks`` instance; it is not launched here.
    """
    with gr.Blocks(
        title="Dialogue→Speaker Dataset Builder",
        theme=gr.themes.Default(),
    ) as app:
        gr.Markdown("# Dialogue→Speaker Dataset Builder")
        gr.Markdown(DESCRIPTION)

        # Data tab: choose the source and the passage-filter settings.
        with gr.Tab("Data"):
            source_radio = gr.Radio(
                ["HF Dataset", "Upload .txt"],
                value="HF Dataset",
                label="Source",
            )
            dataset_box = gr.Textbox(
                value="Navanjana/Gutenberg_books",
                label="HF dataset id (train split)",
            )
            txt_upload = gr.File(file_types=[".txt"], label="Upload a .txt file")
            sample_num = gr.Number(value=5, label="Sample passages (0 = all)")
            min_words_num = gr.Number(value=80, label="Min words per passage")
            chunk_num = gr.Number(value=1200, label="Chunk size (chars)")
            quote_pairs_num = gr.Number(
                value=1,
                label="Min dialogue quote-pairs (0 = no filter)",
            )
            prepare_btn = gr.Button("Prepare passages")
            prepare_info = gr.Markdown()

        # Generation tab: model settings plus a read-only result table.
        with gr.Tab("Generation"):
            model_input = gr.Textbox(
                value=os.getenv("OPENAI_MODEL", MODEL),
                label="OpenAI model",
            )
            temp_slider = gr.Slider(0, 1, value=0.0, step=0.1, label="Temperature")
            generate_btn = gr.Button("Generate with OpenAI")
            generate_info = gr.Markdown()
            records_table = gr.Dataframe(
                value=[],
                headers=["#", "status", "chars"],
                row_count=0,
                col_count=3,
                interactive=False,
            )

        # Review tab: load one record, edit its output and status, save it.
        with gr.Tab("Review"):
            record_num = gr.Number(value=0, label="Record #")
            passage_view = gr.Textbox(lines=12, label="Input passage", interactive=False)
            output_edit = gr.Textbox(lines=12, label="Output (edit)")
            status_choice = gr.Dropdown(
                ["accepted", "needs_work", "unreviewed"],
                value="unreviewed",
                label="Status",
            )
            load_btn = gr.Button("Load record")
            save_btn = gr.Button("Save changes")
            review_info = gr.Markdown()

        # Export tab: write JSONL locally or push to the Hugging Face Hub.
        with gr.Tab("Export"):
            jsonl_btn = gr.Button("Download JSONL")
            jsonl_path_box = gr.Textbox(label="JSONL path")
            repo_box = gr.Textbox(
                value="",
                label="HF Dataset repo (e.g. yourname/gutenberg_dialogue_v1)",
            )
            private_box = gr.Checkbox(value=True, label="Private repo")
            push_btn = gr.Button("Push to Hugging Face Hub")
            export_info = gr.Markdown()

        # Settings tab: display-only; nothing here is wired to a callback.
        with gr.Tab("Settings"):
            gr.Textbox(
                value=INSTRUCTION,
                lines=14,
                label="Canonical instruction (read-only)",
                interactive=False,
            )
            gr.Markdown("Set `OPENAI_API_KEY` & optional `OPENAI_MODEL` in Space Secrets.")

        # Event wiring, one listener per button.
        prepare_btn.click(
            fn=on_prepare,
            inputs=[
                source_radio,
                dataset_box,
                txt_upload,
                sample_num,
                min_words_num,
                chunk_num,
                quote_pairs_num,
            ],
            outputs=[prepare_info],
        )
        generate_btn.click(
            fn=on_generate,
            inputs=[model_input, temp_slider],
            outputs=[generate_info, records_table],
        )
        load_btn.click(
            fn=on_load,
            inputs=[record_num],
            outputs=[passage_view, output_edit, status_choice],
        )
        save_btn.click(
            fn=on_save,
            inputs=[record_num, output_edit, status_choice],
            outputs=[review_info],
        )
        jsonl_btn.click(fn=on_export_jsonl, inputs=[], outputs=[jsonl_path_box])
        push_btn.click(
            fn=on_push,
            inputs=[repo_box, private_box],
            outputs=[export_info],
        )

    return app

# Build the app at import time so that `demo` exists as a module-level name.
# NOTE(review): presumably the hosting runtime looks up `demo` when it imports
# this module — confirm before moving this under the main guard.
demo = build_ui()

# Start the server only when this file is executed directly as a script.
if __name__ == "__main__":
    demo.launch()