import os import gradio as gr from typing import List, Dict, Any, Tuple from data_io import load_from_hub_or_upload from teacher import call_teacher, MODEL, INSTRUCTION from validators import validate_output from exporters import to_jsonl, to_hf_dataset SESSION: Dict[str, Any] = { "passages": [], "records": [], "dataset_id": None, } DESCRIPTION = ( "### Dialogue→Speaker Dataset Builder\n" "Prepare passages, generate `Speaker N:` dialogue via the OpenAI API, " "review & edit, and export JSONL / HF Datasets." ) def on_prepare(src_mode: str, hf_id: str, upload, sample: float, min_words: float, chunk: float, quote_pairs: float) -> str: sample_i = int(sample) if sample else 0 min_words_i = int(min_words) if min_words else 80 chunk_i = int(chunk) if chunk else 1200 qpairs_i = int(quote_pairs) if quote_pairs else 0 passages, dataset_id = load_from_hub_or_upload(src_mode, hf_id, upload, sample_i, min_words_i, chunk_i, quote_pairs=qpairs_i) SESSION["passages"] = passages SESSION["dataset_id"] = dataset_id SESSION["records"] = [] return f"Prepared {len(passages)} passages from: {dataset_id}" def on_generate(model_name: str, temperature: float) -> Tuple[str, list]: if not SESSION["passages"]: return "No passages prepared yet.", [] os.environ["OPENAI_MODEL"] = model_name rows, records, ok, bad = [], [], 0, 0 for i, p in enumerate(SESSION["passages"]): y = call_teacher(p, temperature=float(temperature)) status = "unreviewed" if y and validate_output(y): ok += 1 else: bad += 1 y = y or "" status = "needs_work" rec = { "task": "dialogue_format", "instruction": INSTRUCTION, "input": p, "output": y, "meta": { "chars": len(p), "model": os.getenv("OPENAI_MODEL", model_name), "status": status, "source": "LLM", "dataset_id": SESSION["dataset_id"] } } records.append(rec) rows.append([i, status, len(p)]) SESSION["records"] = records return f"Generated {ok} valid, {bad} need work.", rows def on_load(idx: float) -> Tuple[str, str, str]: i = int(idx) r = SESSION["records"][i] return r["input"], r["output"], r["meta"]["status"] def on_save(idx: float, output: str, status: str) -> str: i = int(idx) SESSION["records"][i]["output"] = output SESSION["records"][i]["meta"]["status"] = status return f"Saved record #{i} as {status}." def on_export_jsonl() -> str: path = "workspace/dataset.jsonl" to_jsonl(SESSION["records"], path) return path def on_push(push_repo: str, private_toggle: bool) -> str: if not push_repo: return "Provide a repo name like 'yourname/gutenberg_dialogue_v1'" ds = to_hf_dataset( SESSION["records"], save_to="workspace/hf_dataset", push_repo=push_repo, private=bool(private_toggle), token=os.getenv("HF_TOKEN") ) return f"Pushed {len(ds)} records to {push_repo}" def build_ui(): with gr.Blocks(title="Dialogue→Speaker Dataset Builder", theme=gr.themes.Default()) as demo: gr.Markdown("# Dialogue→Speaker Dataset Builder") gr.Markdown(DESCRIPTION) with gr.Tab("Data"): src_mode = gr.Radio(["HF Dataset", "Upload .txt"], value="HF Dataset", label="Source") hf_id = gr.Textbox(value="Navanjana/Gutenberg_books", label="HF dataset id (train split)") upload = gr.File(file_types=[".txt"], label="Upload a .txt file") sample = gr.Number(value=5, label="Sample passages (0 = all)") min_words = gr.Number(value=80, label="Min words per passage") chunk = gr.Number(value=1200, label="Chunk size (chars)") quote_pairs = gr.Number(value=1, label="Min dialogue quote-pairs (0 = no filter)") btn_prep = gr.Button("Prepare passages") info_data = gr.Markdown() with gr.Tab("Generation"): model_box = gr.Textbox(value=os.getenv("OPENAI_MODEL", MODEL), label="OpenAI model") temperature = gr.Slider(0, 1, value=0.0, step=0.1, label="Temperature") btn_gen = gr.Button("Generate with OpenAI") progress_gen = gr.Markdown() rec_table = gr.Dataframe(value=[], headers=["#", "status", "chars"], row_count=0, col_count=3, interactive=False) with gr.Tab("Review"): idx = gr.Number(value=0, label="Record #") inp = gr.Textbox(lines=12, label="Input passage", interactive=False) out = gr.Textbox(lines=12, label="Output (edit)") status = gr.Dropdown(["accepted","needs_work","unreviewed"], value="unreviewed", label="Status") btn_load = gr.Button("Load record") btn_save = gr.Button("Save changes") review_msg = gr.Markdown() with gr.Tab("Export"): btn_jsonl = gr.Button("Download JSONL") dl_path = gr.Textbox(label="JSONL path") push_repo = gr.Textbox(value="", label="HF Dataset repo (e.g. yourname/gutenberg_dialogue_v1)") private_toggle = gr.Checkbox(value=True, label="Private repo") btn_push = gr.Button("Push to Hugging Face Hub") export_msg = gr.Markdown() with gr.Tab("Settings"): instr = gr.Textbox(value=INSTRUCTION, lines=14, label="Canonical instruction (read-only)", interactive=False) gr.Markdown("Set `OPENAI_API_KEY` & optional `OPENAI_MODEL` in Space Secrets.") btn_prep.click(on_prepare, [src_mode, hf_id, upload, sample, min_words, chunk, quote_pairs], [info_data]) btn_gen.click(on_generate, [model_box, temperature], [progress_gen, rec_table]) btn_load.click(on_load, [idx], [inp, out, status]) btn_save.click(on_save, [idx, out, status], [review_msg]) btn_jsonl.click(on_export_jsonl, [], [dl_path]) btn_push.click(on_push, [push_repo, private_toggle], [export_msg]) return demo demo = build_ui() if __name__ == "__main__": demo.launch()