Spaces:
Running
Running
| from functools import partial, lru_cache | |
| import duckdb | |
| import gradio as gr | |
| import pandas as pd | |
| import requests | |
| from huggingface_hub import HfApi | |
# Loader function names that are kept when filtering the "libraries" list
# returned by the compatible-libraries endpoint.
READ_PARQUET_FUNCTIONS = ("dd.read_parquet", "pd.read_parquet")
# Placeholder table: 10 rows of empty strings under 4 columns named "0".."3".
EMPTY_DF = pd.DataFrame([{str(i): "" for i in range(4)}] * 10)
# Total number of transform dropdowns that get created (visible ones plus
# hidden padding).
MAX_NUM_COLUMNS = 20
# Stylesheet passed to gr.Blocks(css=...). It sets the background of
# `.transparent-dropdown` elements per color scheme, disables text selection
# on <input> elements, puts `.cell-menu-button` behind other content
# (z-index -1) and hides table headers (<thead>).
css = """
@media (prefers-color-scheme: dark) {
.transparent-dropdown, .transparent-dropdown .container .wrap {
background: var(--bg-dark);
}
}
@media (prefers-color-scheme: light) {
.transparent-dropdown, .transparent-dropdown .container .wrap {
background: var(--bg);
}
}
input {
-webkit-user-select: none;
-moz-user-select: none;
-ms-user-select: none;
user-select: none;
}
.cell-menu-button {
z-index: -1;
}
thead {
display: none;
}
"""
# JavaScript passed to gr.Blocks(js=...). It installs a MutationObserver on
# the whole document (childList + subtree) and, on every mutation, sets the
# `readonly` attribute on the cell <input> elements found under
# `.readonly-dataframe`.
js = """
function setDataFrameReadonly() {
MutationObserver = window.MutationObserver || window.WebKitMutationObserver;
var observer = new MutationObserver(function(mutations, observer) {
// fired when a mutation occurs
document.querySelectorAll('.readonly-dataframe div .table-wrap button svelte-virtual-table-viewport table tbody tr td .cell-wrap input').forEach(i => i.setAttribute("readonly", "true"));
});
// define what element should be observed by the observer
// and what types of mutations trigger the callback
observer.observe(document, {
subtree: true,
childList: true
});
}
"""
# Table of text functions loaded from a tab-separated file when the module is
# executed; this file only reads its `Name` column, to build the choices of
# the transform dropdowns.
text_functions_df = pd.read_csv("text_functions.tsv", delimiter="\t")
def prepare_function(func: str, placeholder: str, column_name: str) -> str:
    """Substitute a column name for the first placeholder of a function template.

    When the template contains an opening parenthesis, only the text after the
    FIRST ``(`` is searched, so a function name that itself contains the
    placeholder (e.g. ``string_split(string, separator)``) is left intact.
    Splitting on the first parenthesis only also makes nested templates such as
    ``lower(trim(string))`` work; previously the placeholder was looked up only
    between the first and second ``(`` and could be missed entirely.

    Args:
        func: Function template, e.g. ``"upper(string)"`` or ``"string || string"``.
        placeholder: Token to replace, e.g. ``"string"``.
        column_name: Text inserted in place of the placeholder.

    Returns:
        The template with at most one occurrence of ``placeholder`` replaced.
    """
    head, paren, tail = func.partition("(")
    if not paren:
        # Operator-style template without a call: replace the first occurrence.
        return func.replace(placeholder, column_name, 1)
    return head + paren + tail.replace(placeholder, column_name, 1)
# Build the UI. Components are created in display order.
# NOTE(review): the nesting of the Group/Row blocks below was reconstructed
# from a source whose indentation was lost -- confirm against the real file.
with gr.Blocks(css=css, js=js) as demo:
    # Hidden components (visible=False).
    loading_codes_json = gr.JSON(visible=False)
    dataset_subset_split_textbox = gr.Textbox(visible=False)
    input_dataframe = gr.DataFrame(visible=False)
    with gr.Group():
        with gr.Row():
            dataset_dropdown = gr.Dropdown(label="Open Dataset", allow_custom_value=True, scale=10)
            # Subset/split selectors start hidden.
            subset_dropdown = gr.Dropdown(info="Subset", allow_custom_value=True, show_label=False, visible=False, elem_classes="transparent-dropdown")
            split_dropdown = gr.Dropdown(info="Split", allow_custom_value=True, show_label=False, visible=False, elem_classes="transparent-dropdown")
            gr.LoginButton()
        with gr.Row():
            # One visible dropdown per placeholder column: the choices are the
            # column itself plus every text function whose name contains
            # "string", with the column name substituted in.
            transform_dropdowns = [gr.Dropdown(choices=[column_name] + [prepare_function(text_func, "string", column_name) for text_func in text_functions_df.Name if "string" in text_func], value=column_name, container=False, interactive=True, allow_custom_value=True, visible=True) for column_name in EMPTY_DF.columns]
            # Pad with hidden dropdowns so the list holds MAX_NUM_COLUMNS entries.
            transform_dropdowns += [gr.Dropdown(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False) for _ in range(MAX_NUM_COLUMNS - len(transform_dropdowns))]
        # Equal-width columns; the "readonly-dataframe" class is the one the
        # `js` snippet looks for when it marks cell inputs as readonly.
        dataframe = gr.DataFrame(EMPTY_DF, column_widths=[f"{1/len(EMPTY_DF.columns):.0%}"] * len(EMPTY_DF.columns), interactive=True, elem_classes="readonly-dataframe")
def _fetch_datasets(request: gr.Request, oauth_token: gr.OAuthToken | None):
    """Build the choices and initial value of the dataset dropdown.

    Lists up to three trending Parquet datasets and, when an OAuth token is
    given and a user is resolved, up to three more authored by that user. The
    selected value is the ``dataset`` query parameter of the request when
    present, otherwise the id of the first listed dataset.

    Returns:
        A dict mapping ``dataset_dropdown`` to its updated ``gr.Dropdown``.
    """
    api = HfApi(token=oauth_token.token if oauth_token else None)

    def _trending_parquet(**extra):
        # Same listing criteria for both calls; only the author filter differs.
        return list(api.list_datasets(limit=3, sort="trendingScore", direction=-1, filter=["format:parquet"], **extra))

    datasets = _trending_parquet()
    if oauth_token:
        # NOTE(review): confirm that whoami() really exposes a "user" key.
        user = api.whoami().get("user")
        if user:
            datasets += _trending_parquet(author=user)
    selected = request.query_params.get("dataset")
    if not selected:
        selected = datasets[0].id
    choices = [info.id for info in datasets]
    return {dataset_dropdown: gr.Dropdown(choices=choices, value=selected)}
| def _fetch_read_parquet_loading(dataset: str): | |
| if dataset and "/" not in dataset.strip().strip("/"): | |
| return [] | |
| resp = requests.get(f"https://datasets-server.huggingface.co/compatible-libraries?dataset={dataset}", timeout=3).json() | |
| return ([lib["loading_codes"] for lib in resp.get("libraries", []) if lib["function"] in READ_PARQUET_FUNCTIONS] or [[]])[0] or [] | |
def _show_subset_dropdown(loading_codes: list[dict]):
    """Build the subset and split dropdowns from the loading codes.

    The first subset (``config_name``) is preselected, together with the first
    split of that subset. Each dropdown is only visible when it offers more
    than one choice.

    Returns:
        A ``(subset dropdown, split dropdown)`` tuple of ``gr.Dropdown``.
    """
    subsets = [code["config_name"] for code in loading_codes]
    subset = subsets[0] if subsets else ""
    splits_per_match = [list(code["arguments"]["splits"]) for code in loading_codes if code["config_name"] == subset]
    splits = splits_per_match[0] if splits_per_match else []
    split = splits[0] if splits else ""
    subset_update = gr.Dropdown(subsets, value=subset, visible=len(subsets) > 1)
    split_update = gr.Dropdown(splits, value=split, visible=len(splits) > 1)
    return subset_update, split_update
def _show_split_dropdown(loading_codes: list[dict], subset: str):
    """Build the split dropdown for the given subset.

    The splits are taken from the first loading code whose ``config_name``
    equals ``subset``; the first split is preselected and the dropdown is only
    visible when there is more than one split.
    """
    splits_per_match = [list(code["arguments"]["splits"]) for code in loading_codes if code["config_name"] == subset]
    splits = splits_per_match[0] if splits_per_match else []
    split = splits[0] if splits else ""
    return gr.Dropdown(splits, value=split, visible=len(splits) > 1)
def _set_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]) -> "gr.DataFrame":
    """Load the first 10 rows of the selected dataset split.

    Looks up the file pattern of ``split`` in the first loading code whose
    ``config_name`` equals ``subset`` and reads it through DuckDB from
    ``hf://datasets/{dataset}/{pattern}``.

    Returns:
        A ``gr.DataFrame`` with equal-width columns holding the loaded rows, or
        the ``EMPTY_DF`` placeholder when the dataset, subset, split or pattern
        is missing.
    """
    pattern = None
    for loading_code in loading_codes:
        if loading_code["config_name"] == subset:
            # .get(): an unknown or empty split falls back to the placeholder
            # below instead of raising KeyError before the guard runs.
            pattern = loading_code["arguments"]["splits"].get(split)
            break
    if dataset and subset and split and pattern:
        # The dataset name can be typed freely by the user; double single
        # quotes so the path cannot terminate the SQL string literal.
        path = f"hf://datasets/{dataset}/{pattern}".replace("'", "''")
        df = duckdb.sql(f"SELECT * FROM '{path}' LIMIT 10").df()
    else:
        df = EMPTY_DF
    return gr.DataFrame(df, column_widths=[f"{1/len(df.columns):.0%}"] * len(df.columns))
def _set_transforms(input_df: pd.DataFrame):
    """Build the transform dropdowns matching the columns of ``input_df``.

    Each column gets a visible dropdown offering the column itself plus every
    text function whose name contains "string", with the column name
    substituted in. The list is padded with hidden dropdowns and always holds
    exactly ``MAX_NUM_COLUMNS`` entries.

    Returns:
        A list of ``MAX_NUM_COLUMNS`` ``gr.Dropdown`` components.
    """
    # Loop-invariant: filter the function templates once, not once per column.
    text_funcs = [text_func for text_func in text_functions_df.Name if "string" in text_func]
    new_transform_dropdowns = []
    # Only the first MAX_NUM_COLUMNS columns get a dropdown, so a wide table
    # cannot make the returned list longer than the number of dropdowns created
    # in the layout.
    for column_name in input_df.columns[:MAX_NUM_COLUMNS]:
        choices = [column_name] + [prepare_function(text_func, "string", column_name) for text_func in text_funcs]
        new_transform_dropdowns.append(
            gr.Dropdown(choices=choices, value=column_name, container=False, interactive=True, allow_custom_value=True, visible=True)
        )
    new_transform_dropdowns += [
        gr.Dropdown(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False)
        for _ in range(MAX_NUM_COLUMNS - len(new_transform_dropdowns))
    ]
    return new_transform_dropdowns
def _set_dataframe(input_df: pd.DataFrame, *transforms: str, column_index: int):
    """Apply the selected transforms to ``input_df`` and return the result.

    Args:
        input_df: Table the transforms are evaluated against; DuckDB resolves
            the name ``input_df`` in the query to this local variable.
        *transforms: One SQL expression per dropdown; empty/None values are
            skipped.
        column_index: Index of the dropdown that triggered the update
            (currently unused).

    Returns:
        A pandas DataFrame with one column per non-empty transform.

    Raises:
        gr.Error: If DuckDB rejects or fails to execute the query.
    """
    query = f"SELECT {', '.join(transform for transform in transforms if transform)} FROM input_df;"
    try:
        # Materialize inside the try block so execution errors are reported as
        # gr.Error too, and so a DataFrame (not a lazy relation) is returned.
        return duckdb.sql(query).df()
    except Exception as e:
        raise gr.Error(f"{type(e).__name__}: {e}") from e
# Register one change listener per transform dropdown. Every listener receives
# the input table followed by the values of all transform dropdowns and writes
# its result to the displayed dataframe.
# NOTE(review): presumably this has to run while the `demo` Blocks context is
# active -- confirm the indentation in the real file.
for column_index, transform_dropdown in enumerate(transform_dropdowns):
    on_change = partial(_set_dataframe, column_index=column_index)
    transform_dropdown.change(on_change, inputs=[input_dataframe, *transform_dropdowns], outputs=dataframe)
# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()