Spaces:
Sleeping
Sleeping
| import os | |
| import gradio as gr | |
| from typing import List, Dict, Any, Tuple | |
| from data_io import load_from_hub_or_upload | |
| from teacher import call_teacher, MODEL, INSTRUCTION | |
| from validators import validate_output | |
| from exporters import to_jsonl, to_hf_dataset | |
# In-memory state shared by all Gradio callbacks in this process.
# NOTE(review): a module-level dict is shared across every visitor of the
# Space — concurrent users will overwrite each other's data; confirm this
# is acceptable for the intended single-user workflow.
SESSION: Dict[str, Any] = {
    "passages": [],      # chunked source passages awaiting generation
    "records": [],       # generated instruction/input/output records
    "dataset_id": None,  # identifier of the currently loaded source
}
# Markdown blurb rendered at the top of the UI.
DESCRIPTION = (
    "### Dialogue→Speaker Dataset Builder\n"
    "Prepare passages, generate `Speaker N:` dialogue via the OpenAI API, "
    "review & edit, and export JSONL / HF Datasets."
)
def on_prepare(src_mode: str, hf_id: str, upload, sample: float, min_words: float, chunk: float, quote_pairs: float) -> str:
    """Load and chunk passages from the selected source into SESSION.

    Args:
        src_mode: "HF Dataset" or "Upload .txt" (which source to read).
        hf_id: Hugging Face dataset id, used when src_mode is "HF Dataset".
        upload: uploaded .txt file handle from gr.File (may be None).
        sample: number of passages to keep (0 = all).
        min_words: minimum words per passage (0 = no filter).
        chunk: chunk size in characters.
        quote_pairs: minimum dialogue quote-pairs per passage (0 = no filter).

    Returns:
        A human-readable status message for the UI.
    """
    # gr.Number yields None when the field is cleared; only then fall back
    # to the defaults. The previous truthiness test (`if sample else ...`)
    # also swallowed an explicit 0, so min_words=0 silently became 80.
    sample_i = int(sample) if sample is not None else 0
    min_words_i = int(min_words) if min_words is not None else 80
    qpairs_i = int(quote_pairs) if quote_pairs is not None else 0
    # A chunk size of 0 is never meaningful, so keep the falsy fallback here.
    chunk_i = int(chunk) if chunk else 1200
    passages, dataset_id = load_from_hub_or_upload(
        src_mode, hf_id, upload, sample_i, min_words_i, chunk_i, quote_pairs=qpairs_i
    )
    SESSION["passages"] = passages
    SESSION["dataset_id"] = dataset_id
    # Invalidate any records generated from a previously prepared source.
    SESSION["records"] = []
    return f"Prepared {len(passages)} passages from: {dataset_id}"
def on_generate(model_name: str, temperature: float) -> Tuple[str, list]:
    """Run the teacher model over every prepared passage.

    Stores the resulting records in SESSION and returns a status message
    plus table rows of [index, status, chars] for the UI dataframe.
    """
    passages = SESSION["passages"]
    if not passages:
        return "No passages prepared yet.", []

    # Make the chosen model visible to downstream helpers via the env.
    os.environ["OPENAI_MODEL"] = model_name

    records: list = []
    rows: list = []
    ok = bad = 0
    temp = float(temperature)

    for i, passage in enumerate(passages):
        output = call_teacher(passage, temperature=temp)
        if output and validate_output(output):
            ok += 1
            status = "unreviewed"
        else:
            bad += 1
            output = output or ""
            status = "needs_work"
        records.append({
            "task": "dialogue_format",
            "instruction": INSTRUCTION,
            "input": passage,
            "output": output,
            "meta": {
                "chars": len(passage),
                "model": os.getenv("OPENAI_MODEL", model_name),
                "status": status,
                "source": "LLM",
                "dataset_id": SESSION["dataset_id"],
            },
        })
        rows.append([i, status, len(passage)])

    SESSION["records"] = records
    return f"Generated {ok} valid, {bad} need work.", rows
def on_load(idx: float) -> Tuple[str, str, str]:
    """Fetch record #idx for review.

    Returns (input passage, output text, review status). An out-of-range
    index or an empty record list yields empty fields with the default
    status instead of raising IndexError into the Gradio callback.
    """
    records = SESSION["records"]
    i = int(idx) if idx is not None else 0
    if not records or not 0 <= i < len(records):
        return "", "", "unreviewed"
    rec = records[i]
    return rec["input"], rec["output"], rec["meta"]["status"]
def on_save(idx: float, output: str, status: str) -> str:
    """Persist an edited output and review status back into SESSION.

    Returns a status message; out-of-range indices are reported instead of
    raising IndexError into the Gradio callback.
    """
    records = SESSION["records"]
    i = int(idx) if idx is not None else 0
    if not records or not 0 <= i < len(records):
        return f"No record #{i} to save — generate records first."
    records[i]["output"] = output
    records[i]["meta"]["status"] = status
    return f"Saved record #{i} as {status}."
def on_export_jsonl() -> str:
    """Write all session records to a JSONL file and return its path."""
    path = "workspace/dataset.jsonl"
    # A fresh Space container may not have the workspace directory yet;
    # create it so the write cannot fail with FileNotFoundError.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    to_jsonl(SESSION["records"], path)
    return path
def on_push(push_repo: str, private_toggle: bool) -> str:
    """Push the session records to the Hugging Face Hub as a dataset.

    Requires a repo name; authentication comes from the HF_TOKEN env var.
    Returns a status message for the UI.
    """
    if not push_repo:
        return "Provide a repo name like 'yourname/gutenberg_dialogue_v1'"
    token = os.getenv("HF_TOKEN")
    dataset = to_hf_dataset(
        SESSION["records"],
        save_to="workspace/hf_dataset",
        push_repo=push_repo,
        private=bool(private_toggle),
        token=token,
    )
    return f"Pushed {len(dataset)} records to {push_repo}"
def build_ui():
    """Assemble the Gradio Blocks UI and wire callbacks; returns the demo."""
    with gr.Blocks(title="Dialogue→Speaker Dataset Builder", theme=gr.themes.Default()) as demo:
        gr.Markdown("# Dialogue→Speaker Dataset Builder")
        gr.Markdown(DESCRIPTION)
        # --- Tab 1: choose a source and chunk it into passages ---
        with gr.Tab("Data"):
            src_mode = gr.Radio(["HF Dataset", "Upload .txt"], value="HF Dataset", label="Source")
            hf_id = gr.Textbox(value="Navanjana/Gutenberg_books", label="HF dataset id (train split)")
            upload = gr.File(file_types=[".txt"], label="Upload a .txt file")
            sample = gr.Number(value=5, label="Sample passages (0 = all)")
            min_words = gr.Number(value=80, label="Min words per passage")
            chunk = gr.Number(value=1200, label="Chunk size (chars)")
            quote_pairs = gr.Number(value=1, label="Min dialogue quote-pairs (0 = no filter)")
            btn_prep = gr.Button("Prepare passages")
            info_data = gr.Markdown()
        # --- Tab 2: run the OpenAI teacher over the prepared passages ---
        with gr.Tab("Generation"):
            model_box = gr.Textbox(value=os.getenv("OPENAI_MODEL", MODEL), label="OpenAI model")
            temperature = gr.Slider(0, 1, value=0.0, step=0.1, label="Temperature")
            btn_gen = gr.Button("Generate with OpenAI")
            progress_gen = gr.Markdown()
            rec_table = gr.Dataframe(value=[], headers=["#", "status", "chars"], row_count=0, col_count=3, interactive=False)
        # --- Tab 3: inspect and hand-edit individual records ---
        with gr.Tab("Review"):
            idx = gr.Number(value=0, label="Record #")
            inp = gr.Textbox(lines=12, label="Input passage", interactive=False)
            out = gr.Textbox(lines=12, label="Output (edit)")
            status = gr.Dropdown(["accepted","needs_work","unreviewed"], value="unreviewed", label="Status")
            btn_load = gr.Button("Load record")
            btn_save = gr.Button("Save changes")
            review_msg = gr.Markdown()
        # --- Tab 4: export to JSONL or push to the HF Hub ---
        with gr.Tab("Export"):
            btn_jsonl = gr.Button("Download JSONL")
            dl_path = gr.Textbox(label="JSONL path")
            push_repo = gr.Textbox(value="", label="HF Dataset repo (e.g. yourname/gutenberg_dialogue_v1)")
            private_toggle = gr.Checkbox(value=True, label="Private repo")
            btn_push = gr.Button("Push to Hugging Face Hub")
            export_msg = gr.Markdown()
        # --- Tab 5: read-only view of the canonical instruction ---
        with gr.Tab("Settings"):
            instr = gr.Textbox(value=INSTRUCTION, lines=14, label="Canonical instruction (read-only)", interactive=False)
            gr.Markdown("Set `OPENAI_API_KEY` & optional `OPENAI_MODEL` in Space Secrets.")
        # Wire each button to its callback: (fn, [input components], [output components]).
        btn_prep.click(on_prepare, [src_mode, hf_id, upload, sample, min_words, chunk, quote_pairs], [info_data])
        btn_gen.click(on_generate, [model_box, temperature], [progress_gen, rec_table])
        btn_load.click(on_load, [idx], [inp, out, status])
        btn_save.click(on_save, [idx, out, status], [review_msg])
        btn_jsonl.click(on_export_jsonl, [], [dl_path])
        btn_push.click(on_push, [push_repo, private_toggle], [export_msg])
    return demo
# Built at import time so Hugging Face Spaces can discover `demo`.
demo = build_ui()

if __name__ == "__main__":
    demo.launch()