| import json |
| import os |
| import shutil |
|
|
| import gradio as gr |
| from dingo.exec import Executor |
| from dingo.io import InputArgs |
|
|
|
|
def dingo_demo(dataset_source, input_path, uploaded_file, data_format, column_content, rule_list, prompt_list, model,
               key, api_url):
    """Run one dingo evaluation for the demo UI and return its results.

    Input problems are reported as a message in the first return slot
    instead of being raised, so the UI can show them in the summary box.

    Args:
        dataset_source: ``"hugging_face"`` to evaluate ``input_path``; any
            other value evaluates ``uploaded_file``.
        input_path: Dataset path, required when ``dataset_source`` is
            ``"hugging_face"``.
        uploaded_file: Uploaded file, required otherwise. Either an object
            exposing the path as ``.name`` or the path string itself.
        data_format: Value forwarded as ``data_format``; must be non-empty.
        column_content: Value forwarded as ``column_content``; must be
            non-empty.
        rule_list: Selected rule names; may be empty if ``prompt_list`` is not.
        prompt_list: Selected prompt names; may be empty if ``rule_list`` is
            not.
        model: Forwarded as ``llm_config.detect_text_quality_detail.model``.
        key: Forwarded as ``llm_config.detect_text_quality_detail.key``.
        api_url: Forwarded as ``llm_config.detect_text_quality_detail.api_url``.

    Returns:
        A ``(summary, detail)`` tuple. On a validation failure ``summary`` is
        the error message and ``detail`` is ``None``. Otherwise ``summary`` is
        the executor summary dict dumped as JSON (indent 4) and ``detail`` is
        the list of ``to_raw_dict()`` results for every item returned by
        ``get_bad_info_list()``.
    """
    # Guard clauses: reject incomplete form input before touching the executor.
    if not data_format:
        return 'ValueError: data_format can not be empty, please input.', None
    if not column_content:
        return 'ValueError: column_content can not be empty, please input.', None
    if not rule_list and not prompt_list:
        return 'ValueError: rule_list and prompt_list can not be empty at the same time.', None

    # Resolve where the data comes from, depending on the selected source.
    if dataset_source == "hugging_face":
        if not input_path:
            return 'ValueError: input_path can not be empty for hugging_face dataset, please input.', None
        final_input_path = input_path
    else:
        if not uploaded_file:
            return 'ValueError: Please upload a file for local dataset.', None
        # Accept an upload object exposing ``.name`` as well as a bare path
        # string, so a string upload value does not fail with AttributeError.
        final_input_path = getattr(uploaded_file, "name", uploaded_file)

    input_data = {
        "dataset": dataset_source,
        "input_path": final_input_path,
        # Local runs write next to the uploaded file; hugging_face runs pass
        # an empty output path.
        "output_path": "" if dataset_source == 'hugging_face' else os.path.dirname(final_input_path),
        "save_data": True,
        "save_raw": True,
        "data_format": data_format,
        "column_content": column_content,
        "custom_config": {
            "rule_list": rule_list,
            "prompt_list": prompt_list,
            "llm_config": {
                "detect_text_quality_detail": {
                    "model": model,
                    "key": key,
                    "api_url": api_url,
                },
            },
        },
    }
    input_args = InputArgs(**input_data)
    executor = Executor.exec_map["local"](input_args)
    executor.execute()

    summary = executor.get_summary().to_dict()
    new_detail = [item.to_raw_dict() for item in executor.get_bad_info_list()]

    # The results are returned in memory, so the directory reported by the
    # summary is removed once they have been collected.
    if summary['output_path']:
        shutil.rmtree(summary['output_path'])

    return json.dumps(summary, indent=4), new_detail
|
|
|
|
def update_input_components(dataset_source):
    """Choose which dataset input widget is shown for the selected source.

    Returns a two-element list, a ``gr.Textbox`` followed by a ``gr.File``,
    with opposite ``visible`` flags: the textbox is visible only when
    ``dataset_source`` is ``"hugging_face"``, the file widget otherwise.
    """
    show_path_box = dataset_source == "hugging_face"
    return [
        gr.Textbox(visible=show_path_box),
        gr.File(visible=not show_path_box),
    ]
|
|
|
|
if __name__ == '__main__':
    # Choices offered by the two checkbox groups below.
    rule_options = [
        'RuleAbnormalChar',
        'RuleAbnormalHtml',
        'RuleContentNull',
        'RuleContentShort',
        'RuleEnterAndSpace',
        'RuleOnlyUrl',
    ]
    prompt_options = ['PromptRepeat', 'PromptContentChaos']

    # Banner markup shown at the top of the page. Decode explicitly as UTF-8
    # so the result does not depend on the platform's default encoding.
    with open("header.html", "r", encoding="utf-8") as file:
        header = file.read()

    with gr.Blocks() as demo:
        gr.HTML(header)
        with gr.Row():
            # Left column: all inputs plus the submit button.
            with gr.Column():
                with gr.Column():
                    dataset_source = gr.Dropdown(
                        choices=["hugging_face", "local"],
                        value="local",
                        label="dataset [source]"
                    )
                    # Hidden initially because the default source is "local";
                    # update_input_components toggles it with the file widget.
                    input_path = gr.Textbox(
                        value='chupei/format-jsonl',
                        placeholder="please input hugging_face dataset path",
                        label="input_path",
                        visible=False
                    )
                    uploaded_file = gr.File(
                        label="upload file",
                        visible=True
                    )

                    data_format = gr.Dropdown(
                        ["jsonl", "json", "plaintext", "listjson"],
                        label="data_format"
                    )
                    column_content = gr.Textbox(
                        value="content",
                        placeholder="please input column name of content in dataset",
                        label="column_content"
                    )

                    rule_list = gr.CheckboxGroup(
                        choices=rule_options,
                        label="rule_list"
                    )
                    prompt_list = gr.CheckboxGroup(
                        choices=prompt_options,
                        label="prompt_list"
                    )
                    model = gr.Textbox(
                        placeholder="If want to use llm, please input model, such as: deepseek-chat",
                        label="model"
                    )
                    # Masked input: the API key is a secret and should not be
                    # displayed in clear text on screen.
                    key = gr.Textbox(
                        placeholder="If want to use llm, please input key, such as: 123456789012345678901234567890xx",
                        label="API KEY",
                        type="password"
                    )
                    api_url = gr.Textbox(
                        placeholder="If want to use llm, please input api_url, such as: https://api.deepseek.com/v1",
                        label="API URL"
                    )

                with gr.Row():
                    submit_single = gr.Button(value="Submit", interactive=True, variant="primary")

            # Right column: results, split into a text summary and a JSON detail tab.
            with gr.Column():
                with gr.Tabs():
                    with gr.Tab("Result Summary"):
                        summary_output = gr.Textbox(label="summary", max_lines=50)
                    with gr.Tab("Result Detail"):
                        detail_output = gr.JSON(label="detail", max_height=800)

        # Swap between the path textbox and the upload widget when the source changes.
        dataset_source.change(
            fn=update_input_components,
            inputs=dataset_source,
            outputs=[input_path, uploaded_file]
        )

        # Input order must match the positional parameters of dingo_demo.
        submit_single.click(
            fn=dingo_demo,
            inputs=[dataset_source, input_path, uploaded_file, data_format, column_content, rule_list, prompt_list,
                    model, key, api_url],
            outputs=[summary_output, detail_output]
        )

    demo.launch()
|
|