Spaces:
Build error
Build error
| import json | |
| import gradio as gr | |
| import pandas as pd | |
| from datasets import load_dataset | |
| from gradio_huggingfacehub_search import HuggingfaceHubSearch | |
| from src.distilabel_dataset_generator.utils import get_org_dropdown | |
def get_iframe(hub_repo_id) -> str:
    """Build the HTML ``<iframe>`` snippet embedding a Hub dataset viewer.

    Args:
        hub_repo_id: Dataset repository id, e.g. ``"org/name"``.

    Returns:
        An HTML string containing a single ``<iframe>`` pointing at
        ``https://huggingface.co/datasets/<repo id>/embed/viewer``.

    Raises:
        gr.Error: If ``hub_repo_id`` is empty or ``None``.
    """
    # Function-scope imports keep this block self-contained.
    from html import escape
    from urllib.parse import quote

    if not hub_repo_id:
        raise gr.Error("Hub repo id is required")

    # The id comes from a free-text search box: percent-encode it for the URL
    # path (keeping the "org/name" slash) and escape the resulting attribute
    # value so quotes or angle brackets cannot break out of the markup.
    repo_path = quote(str(hub_repo_id), safe="/")
    url = f"https://huggingface.co/datasets/{repo_path}/embed/viewer"
    iframe = f"""
    <iframe
        src="{escape(url, quote=True)}"
        frameborder="0"
        width="100%"
        height="600px"
    ></iframe>
    """
    return iframe
def get_valid_columns(df: pd.DataFrame) -> list:
    """Return the columns whose first value is a string or a list of dicts.

    Only the first row of each column is inspected. A list qualifies when
    every item in it is a ``dict`` (an empty list therefore qualifies too).

    Args:
        df: DataFrame to inspect.

    Returns:
        Column labels, in their original order, that pass the check. An
        empty DataFrame (no rows or no columns) yields an empty list.
    """
    # Without rows there is no sample to inspect; ``.iloc[0]`` would raise.
    if df.empty:
        return []

    def _is_supported(sample) -> bool:
        # Accept plain text, or a list made up solely of dicts.
        if isinstance(sample, str):
            return True
        return isinstance(sample, list) and all(
            isinstance(item, dict) for item in sample
        )

    return [col for col in df.columns if _is_supported(df[col].iloc[0])]
def load_dataset_from_hub(hub_repo_id: str, n_rows: int = 10):
    """Load the first split of a Hub dataset and describe its usable columns.

    Args:
        hub_repo_id: Dataset repository id passed to ``load_dataset``.
        n_rows: Maximum number of rows to keep from the start of the split.
            A falsy value (``0`` or ``None``) keeps the whole split. Values
            larger than the split are clamped to its length.

    Returns:
        A 4-tuple ``(df, instruction_dd, instruction_dd, response_dd)``: the
        loaded rows as a pandas DataFrame followed by three ``gr.Dropdown``
        components whose choices are the columns accepted by
        ``get_valid_columns`` (two labelled "Instruction Column", one
        labelled "Response Column").

    Raises:
        gr.Error: If ``hub_repo_id`` is empty, or if ``n_rows`` truncates to
            a value below 1.
    """
    # Validate first so the "Loading" toast is not shown for a bad request.
    if not hub_repo_id:
        raise gr.Error("Hub repo id is required")
    gr.Info(message="Loading dataset ...")

    ds_dict = load_dataset(hub_repo_id)
    first_split = next(iter(ds_dict.keys()))
    ds = ds_dict[first_split]

    if n_rows:
        # The UI number field may deliver a float; ``range`` needs an int.
        row_limit = int(n_rows)
        if row_limit < 1:
            raise gr.Error("Number of rows must be at least 1")
        # Clamp so a request larger than the split does not index past its end.
        ds = ds.select(range(min(row_limit, len(ds))))

    df = ds.to_pandas()
    # Get columns that contain either strings or lists of dictionaries
    valid_columns = get_valid_columns(df)
    return (
        df,
        gr.Dropdown(choices=valid_columns, label="Instruction Column"),
        gr.Dropdown(choices=valid_columns, label="Instruction Column"),
        gr.Dropdown(choices=valid_columns, label="Response Column"),
    )
def define_evaluation_aspects(task_type: str):
    """Create the "Evaluation Aspects" dropdown for a given task type.

    Args:
        task_type: ``"instruction"`` or ``"instruction-response"``; any other
            value produces a non-interactive dropdown.

    Returns:
        A ``gr.Dropdown``. For the two known task types it is an interactive
        multiselect whose default selection is one of its own choices.
    """
    if task_type == "instruction":
        choices = ["complexity", "quality"]
        # The default must be one of the choices above; "overall-rating" is
        # only offered for the instruction-response task.
        default = ["quality"]
    elif task_type == "instruction-response":
        choices = ["helpfulness", "truthfulness", "overall-rating", "honesty"]
        default = ["overall-rating"]
    else:
        return gr.Dropdown(interactive=False)

    return gr.Dropdown(
        value=default,
        choices=choices,
        label="Evaluation Aspects",
        multiselect=True,
        interactive=True,
    )
def evaluate_instruction(df: pd.DataFrame, aspects: list[str], instruction_column: str):
    """Evaluate the instructions in ``df`` (placeholder, not implemented yet).

    Args:
        df: Rows to evaluate.
        aspects: Evaluation aspects selected in the UI.
        instruction_column: Name of the column holding the instruction.

    Returns:
        ``df`` unchanged. ``_apply_to_dataset`` assigns this return value
        back to its DataFrame, so returning ``None`` would discard the data.
    """
    # TODO: implement the instruction evaluation; pass the data through for now.
    return df
def evaluate_instruction_response(
    df: pd.DataFrame, aspects: list[str], instruction_column: str, response_column: str
):
    """Evaluate instruction/response pairs in ``df`` (placeholder, not implemented yet).

    Args:
        df: Rows to evaluate.
        aspects: Evaluation aspects selected in the UI.
        instruction_column: Name of the column holding the instruction.
        response_column: Name of the column holding the response.

    Returns:
        ``df`` unchanged. ``_apply_to_dataset`` assigns this return value
        back to its DataFrame, so returning ``None`` would discard the data.
    """
    # TODO: implement the pair evaluation; pass the data through for now.
    return df
def evaluate_custom(
    df: pd.DataFrame, aspects: list[str], prompt_template: str, structured_output: dict
):
    """Evaluate ``df`` with a custom prompt (placeholder, not implemented yet).

    Args:
        df: Rows to evaluate.
        aspects: Evaluation aspects selected in the UI.
        prompt_template: Prompt template text from the UI.
        structured_output: Structured-output specification from the UI.

    Returns:
        ``df`` unchanged. ``_apply_to_dataset`` assigns this return value
        back to its DataFrame, so returning ``None`` would discard the data.
    """
    # TODO: implement the custom evaluation; pass the data through for now.
    return df
| def _apply_to_dataset( | |
| df: pd.DataFrame, | |
| eval_type: str, | |
| aspects_instruction: list[str], | |
| instruction_column: str, | |
| aspects_instruction_response: list[str], | |
| instruction_column_response: str, | |
| response_column_response: str, | |
| aspects_custom: list[str], | |
| prompt_template: str, | |
| structured_output: dict, | |
| ): | |
| if eval_type == "instruction": | |
| df = evaluate_instruction(df, aspects_instruction, instruction_column) | |
| elif eval_type == "instruction-response": | |
| df = evaluate_instruction_response( | |
| df, | |
| aspects_instruction_response, | |
| instruction_column_response, | |
| response_column_response, | |
| ) | |
| elif eval_type == "custom": | |
| df = evaluate_custom(df, aspects_custom, prompt_template, structured_output) | |
| return df | |
def apply_to_sample_dataset(
    repo_id: str,
    eval_type: str,
    aspects_instruction: list[str],
    aspects_instruction_response: list[str],
    aspects_custom: list[str],
    instruction_instruction: str,
    instruction_instruction_response: str,
    response_instruction_response: str,
    prompt_template: str,
    structured_output: dict,
):
    """Load a 10-row sample of ``repo_id`` and run the selected evaluation on it.

    The parameter order mirrors the Gradio inputs wired to this callback;
    the values are re-mapped by keyword onto ``_apply_to_dataset``.

    Returns:
        The result of ``_apply_to_dataset`` for the sampled rows.
    """
    # Only the DataFrame is needed here; the dropdown components are ignored.
    sample_df, _, _, _ = load_dataset_from_hub(repo_id, n_rows=10)
    return _apply_to_dataset(
        sample_df,
        eval_type,
        aspects_instruction=aspects_instruction,
        instruction_column=instruction_instruction,
        aspects_instruction_response=aspects_instruction_response,
        instruction_column_response=instruction_instruction_response,
        response_column_response=response_instruction_response,
        aspects_custom=aspects_custom,
        prompt_template=prompt_template,
        structured_output=structured_output,
    )
def push_to_hub(
    org_name: str,
    repo_name: str,
    private: bool,
    n_rows: int,
    original_repo_id: str,
    eval_type: str,
    aspects_instruction: list[str],
    aspects_instruction_response: list[str],
    aspects_custom: list[str],
    instruction_instruction: str,
    instruction_instruction_response: str,
    response_instruction_response: str,
    prompt_template: str,
    structured_output: dict,
):
    """Load ``n_rows`` rows of the source dataset, evaluate them, and print the result.

    The target repo id is composed from ``org_name`` and ``repo_name`` but is
    not used yet, and ``private`` is not read: nothing is uploaded and the
    function returns ``None``.
    """
    # Only the DataFrame is needed here; the dropdown components are ignored.
    loaded_df, _, _, _ = load_dataset_from_hub(original_repo_id, n_rows=n_rows)
    evaluated_df = _apply_to_dataset(
        loaded_df,
        eval_type,
        aspects_instruction=aspects_instruction,
        instruction_column=instruction_instruction,
        aspects_instruction_response=aspects_instruction_response,
        instruction_column_response=instruction_instruction_response,
        response_column_response=response_instruction_response,
        aspects_custom=aspects_custom,
        prompt_template=prompt_template,
        structured_output=structured_output,
    )
    # TODO: the upload step is missing; the target id is computed but unused.
    new_repo_id = f"{org_name}/{repo_name}"
    print(evaluated_df)
# UI definition: three stacked sections (pick a dataset, configure the
# evaluation task, generate/push), followed by the event wiring.
# NOTE(review): nesting below was reconstructed from a flattened listing —
# confirm the layout against the original file.
with gr.Blocks() as app:
    # --- Section 1: dataset selection and preview -------------------------
    gr.Markdown("## Select your input dataset")
    gr.HTML("<hr>")
    with gr.Row():
        with gr.Column(scale=1):
            # NOTE(review): the keyword is spelled "sumbit_on_select" here;
            # check the component's signature before renaming it.
            search_in = HuggingfaceHubSearch(
                label="Search",
                placeholder="Search for a Dataset",
                search_type="dataset",
                sumbit_on_select=True,
            )
            load_btn = gr.Button("Load Dataset")
        with gr.Column(scale=3):
            search_out = gr.HTML(label="Dataset Preview")
    # --- Section 2: task configuration ------------------------------------
    gr.Markdown("## Configure your task")
    gr.HTML("<hr>")
    with gr.Row():
        with gr.Column(scale=1):
            # Hidden dropdown that holds the active evaluation type; each
            # tab's select handler below writes its own name into it.
            eval_type = gr.Dropdown(
                label="Evaluation Type",
                choices=["instruction", "instruction-response", "custom"],
                visible=False,
            )
            with gr.Tab("instruction") as tab_instruction:
                aspects_instruction = define_evaluation_aspects("instruction")
                instruction_instruction = gr.Dropdown(
                    label="Instruction Column", interactive=True
                )
                tab_instruction.select(
                    lambda: "instruction",
                    inputs=[],
                    outputs=[eval_type],
                )
            with gr.Tab("instruction-response") as tab_instruction_response:
                aspects_instruction_response = define_evaluation_aspects(
                    "instruction-response"
                )
                instruction_instruction_response = gr.Dropdown(
                    label="Instruction Column", interactive=True
                )
                response_instruction_response = gr.Dropdown(
                    label="Response Column", interactive=True
                )
                tab_instruction_response.select(
                    lambda: "instruction-response",
                    inputs=[],
                    outputs=[eval_type],
                )
            with gr.Tab("custom") as tab_custom:
                # "custom" is not a known task type for
                # define_evaluation_aspects, so this dropdown is created
                # non-interactive.
                aspects_custom = define_evaluation_aspects("custom")
                prompt_template = gr.Code(
                    label="Prompt Template",
                    value="{{column_1}} based on {{column_2}}",
                    language="markdown",
                    interactive=True,
                )
                structured_output = gr.Code(
                    label="Structured Output",
                    value=json.dumps({"eval_aspect": "str"}),
                    language="json",
                    interactive=True,
                )
                tab_custom.select(
                    lambda: "custom",
                    inputs=[],
                    outputs=[eval_type],
                )
            btn_apply_to_sample_dataset = gr.Button("Refresh dataset")
        with gr.Column(scale=3):
            dataframe = gr.Dataframe()
    # --- Section 3: generation / push settings ----------------------------
    gr.Markdown("## Generate your dataset")
    gr.HTML("<hr>")
    with gr.Row():
        with gr.Column(scale=1):
            org_name = get_org_dropdown()
            repo_name = gr.Textbox(
                label="Repo name",
                placeholder="dataset_name",
                value="my-distiset",
                interactive=True,
            )
            n_rows = gr.Number(
                label="Number of rows",
                value=10,
                interactive=True,
                scale=1,
            )
            private = gr.Checkbox(
                label="Private dataset",
                value=False,
                interactive=True,
                scale=1,
            )
            btn_push_to_hub = gr.Button("Push to Hub", variant="primary", scale=2)
        with gr.Column(scale=3):
            success_message = gr.Markdown(visible=False)
    # --- Event wiring -----------------------------------------------------
    # Submitting the search box renders the dataset viewer iframe.
    search_in.submit(get_iframe, inputs=search_in, outputs=search_out)
    # Loading fills the preview table and the three column dropdowns; the
    # outputs are listed in the order load_dataset_from_hub returns them.
    load_btn.click(
        load_dataset_from_hub,
        inputs=[search_in],
        outputs=[
            dataframe,
            instruction_instruction,
            instruction_instruction_response,
            response_instruction_response,
        ],
    )
    # Inputs are listed in the positional order of apply_to_sample_dataset's
    # parameters.
    btn_apply_to_sample_dataset.click(
        apply_to_sample_dataset,
        inputs=[
            search_in,
            eval_type,
            aspects_instruction,
            aspects_instruction_response,
            aspects_custom,
            instruction_instruction,
            instruction_instruction_response,
            response_instruction_response,
            prompt_template,
            structured_output,
        ],
        outputs=dataframe,
    )
    # Inputs are listed in the positional order of push_to_hub's parameters.
    btn_push_to_hub.click(
        push_to_hub,
        inputs=[
            org_name,
            repo_name,
            private,
            n_rows,
            search_in,
            eval_type,
            aspects_instruction,
            aspects_instruction_response,
            aspects_custom,
            instruction_instruction,
            instruction_instruction_response,
            response_instruction_response,
            prompt_template,
            structured_output,
        ],
        outputs=success_message,
    )
    # Re-run get_org_dropdown on page load and write its result to org_name.
    app.load(fn=get_org_dropdown, outputs=[org_name])