Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import jsonlines | |
| import os | |
| import uuid | |
| from datetime import datetime | |
| from huggingface_hub import HfApi | |
| from pprint import pprint | |
# Names of the datasets that can be annotated. Each name maps to a JSON-lines
# file at data/<name>_examples_with_stats.json (see line_generator).
datasets = [
    "gutenberg_raw",
    "stackexchange2",
    "bigcode_python_code",
    "bigcode_python_github_issues",
    "bigcode_python_jupyter_scripts_dedup_filtered",
    "books3",
    "c4",
    "s2orc_raw",
    "reddit_threaded",
    "cc_filtered_text",
]


def line_generator(dataset):
    """Lazily yield the example records of ``dataset``.

    Records are read with ``jsonlines`` from
    ``data/<dataset>_examples_with_stats.json``. The file is only opened once
    the first record is requested.

    Args:
        dataset: Name of the dataset to read. Names that are not listed in
            ``datasets`` produce an empty generator.

    Yields:
        Each record of the dataset's examples file, in file order.
    """
    # Unknown names yield nothing rather than attempting to open a file.
    if dataset not in datasets:
        return

    # All datasets share one naming scheme, so the path is derived from the
    # name. This also fixes "cc_filtered_text", which used to read the
    # reddit_threaded file because of a copy-pasted path.
    path = f"data/{dataset}_examples_with_stats.json"
    with jsonlines.open(path, "r") as reader:
        yield from reader


# One generator per dataset, created once at module level: every caller that
# pulls from an entry advances the same underlying generator.
line_generators = {dataset: line_generator(dataset) for dataset in datasets}
def send_report(sample, dataset, reason, annotator, campaign):
    """Upload a flagging report for ``sample`` to the feedback dataset repo.

    The report is written as a single JSON line to a uniquely named local
    file, uploaded to the ``HuggingFaceGECLM/data_feedback`` dataset repo
    under the same name, and the local file is then removed.

    Args:
        sample: Record of the flagged example. Must contain a ``"text"`` key;
            every other key is forwarded as metadata. The dict is not
            modified.
        dataset: Name of the dataset the sample comes from.
        reason: Reason given for flagging the sample.
        annotator: Annotator name as entered in the UI.
        campaign: Annotation campaign name as entered in the UI.

    Raises:
        KeyError: If ``sample`` has no ``"text"`` key.
    """
    text = sample["text"]
    # Build the metadata from a filtered copy so the caller's dict keeps its
    # "text" entry (it may be reused if the upload fails and is retried).
    metadata = {key: value for key, value in sample.items() if key != "text"}
    # Prefer an explicit "id", fall back to "title", otherwise leave it empty.
    sample_id = sample.get("id", sample.get("title", ""))

    # Build the report once so the logged and uploaded copies are identical,
    # including the timestamp.
    report = {
        "dataset": dataset,
        "docid": sample_id,
        "text": text,
        "metadata": metadata,
        "reason": reason,
        "annotator": annotator,
        "campaign": campaign,
        "timestamp": str(datetime.now()),
    }
    print("submitting")
    pprint(report)

    # The access token is deliberately not printed: it is a secret and would
    # otherwise end up in the application logs.
    token = os.environ.get("geclm_token")

    # A unique file name per report keeps concurrent submissions from
    # overwriting each other's local file before it is uploaded.
    report_name = "report-{}.jsonl".format(uuid.uuid4())
    try:
        with jsonlines.open(report_name, "w") as f:
            f.write(report)
        api = HfApi()
        api.upload_file(
            path_or_fileobj=report_name,
            path_in_repo=report_name,
            repo_id="HuggingFaceGECLM/data_feedback",
            repo_type="dataset",
            token=token,
        )
    finally:
        # Do not leave per-report files behind, whether or not the upload
        # succeeded.
        if os.path.exists(report_name):
            os.remove(report_name)
description = """
GecLM annotations. All annotations are recorded in the [data_feedback](https://huggingface.co/datasets/HuggingFaceGECLM/data_feedback) dataset.
"""

if __name__ == "__main__":
    # Only the UI code needs html.escape, so the import is kept local.
    import html

    # Sentinel distinguishing "generator exhausted" from any real record.
    _EXHAUSTED = object()

    demo = gr.Blocks()
    with demo:
        # Holds the record currently shown to the annotator.
        current_sample_state = gr.State(dict())
        # Separate name so the module-level description string is not rebound.
        description_md = gr.Markdown(value=description)
        with gr.Row():
            annotator = gr.Textbox(
                lines=1,
                max_lines=1,
                placeholder="Optionally provide your name here if you'd like it to be recorded.",
                label="Annotator",
            )
            campaign = gr.Textbox(
                lines=1,
                max_lines=1,
                placeholder="Optionally provide the name of the annotation campagin for ease of filtering the reports.",
                label="Annotation campaign",
            )
        with gr.Row():
            dataset = gr.Dropdown(
                choices=datasets, value="Pick a dataset below", label="Dataset",
            )
        with gr.Row():
            reason_txt = gr.Textbox(
                label="Flagging reason",
                placeholder="Provide the reason for flagging if you think the sample is bad.",
                visible=False,
            )
        with gr.Row():
            bad_btn = gr.Button("Bad ❌", visible=False)
            good_btn = gr.Button("Next ✅", visible=False)
        with gr.Row():
            text = gr.Markdown(visible=False)

        def fetch_sample(dataset):
            """Return the next record of ``dataset``, restarting once at the end.

            The generators in ``line_generators`` are created once at module
            level, so an exhausted one would otherwise make every later
            request for that dataset fail with StopIteration.
            """
            sample = next(line_generators[dataset], _EXHAUSTED)
            if sample is _EXHAUSTED:
                # Start over from the beginning of the examples file.
                line_generators[dataset] = line_generator(dataset)
                sample = next(line_generators[dataset], _EXHAUSTED)
            if sample is _EXHAUSTED:
                raise ValueError(
                    "No examples available for dataset {!r}".format(dataset)
                )
            return sample

        def render_sample(sample):
            """Return the sample text as an HTML ``<pre>`` block.

            The text is escaped so that markup inside a sample is displayed
            literally instead of being interpreted as HTML.
            """
            return "<pre>" + html.escape(sample["text"]) + "</pre>"

        def next_line(dataset):
            """Show the next sample and reveal the annotation controls."""
            sample = fetch_sample(dataset)
            return [
                gr.update(value=render_sample(sample), visible=True),
                sample,
                gr.update(visible=True),
                gr.update(visible=True),
                gr.update(visible=True),
            ]

        def bad_line(current_sample, dataset, reason, annotator, campaign):
            """Report the current sample as bad, then move to the next one."""
            send_report(current_sample, dataset, reason, annotator, campaign)
            sample = fetch_sample(dataset)
            return [
                render_sample(sample),
                # Clear the reason box for the next sample.
                gr.update(
                    value="",
                    placeholder="Provide the reason for flagging if you think the sample is bad.",
                ),
                sample,
            ]

        good_btn.click(
            next_line,
            inputs=dataset,
            outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],
        )
        # Picking a dataset loads its first sample through the same handler.
        dataset.change(
            next_line,
            inputs=dataset,
            outputs=[text, current_sample_state, reason_txt, good_btn, bad_btn],
        )
        bad_btn.click(
            bad_line,
            inputs=[current_sample_state, dataset, reason_txt, annotator, campaign],
            outputs=[text, reason_txt, current_sample_state],
        )

    # NOTE(review): `enable_queue` is deprecated in later Gradio 3.x releases
    # in favour of `demo.queue()` and may be rejected by newer versions -
    # confirm against the Gradio version pinned for this app.
    demo.launch(enable_queue=False, debug=True)