Spaces:
Runtime error
Runtime error
polinaeterna
committed on
Commit
·
fd7a758
1
Parent(s):
284cae9
add nested texts
Browse files
app.py
CHANGED
|
@@ -107,7 +107,9 @@ def run_quality_check(dataset, config, split, column, batch_size, num_examples):
|
|
| 107 |
logging.info("Data fetched.")
|
| 108 |
|
| 109 |
data_sample = data.sample(num_examples, seed=16) if data.shape[0] > num_examples else data
|
| 110 |
-
texts =
|
|
|
|
|
|
|
| 111 |
predictions, texts_processed = [], []
|
| 112 |
num_examples = min(len(texts), num_examples)
|
| 113 |
for i in range(0, num_examples, batch_size):
|
|
@@ -144,7 +146,7 @@ def plot_toxicity(scores):
|
|
| 144 |
|
| 145 |
return fig
|
| 146 |
|
| 147 |
-
def call_perspective_api(texts_df, column_name, dataset, config, split):#, full_check=False):
|
| 148 |
headers = {
|
| 149 |
"content-type": "application/json",
|
| 150 |
}
|
|
@@ -154,21 +156,23 @@ def call_perspective_api(texts_df, column_name, dataset, config, split):#, full_
|
|
| 154 |
if texts_df.values.tolist() == [['', '', '']]:
|
| 155 |
logging.info(f"Fetching data for {dataset=} {config=} {split=} {column_name=}")
|
| 156 |
try:
|
| 157 |
-
|
| 158 |
-
except
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
|
|
|
| 167 |
logging.info("Data fetched.")
|
| 168 |
texts_df = texts_df.to_pandas()
|
| 169 |
|
| 170 |
-
# texts = texts_df.sample(100, seed=16)[column_name].values if not full_check else texts_df[column_name].values
|
| 171 |
texts = texts_df.sample(100, random_state=16)[column_name].values if texts_df.shape[0] > 100 else texts_df[column_name].values
|
|
|
|
|
|
|
| 172 |
|
| 173 |
n_samples = len(texts)
|
| 174 |
for i, text in tqdm(enumerate(texts), desc="scanning with perspective"):
|
|
@@ -245,45 +249,80 @@ with gr.Blocks() as demo:
|
|
| 245 |
"""
|
| 246 |
return gr.HTML(value=html_code)
|
| 247 |
|
| 248 |
-
|
|
|
|
|
|
|
| 249 |
|
| 250 |
-
def _resolve_dataset_selection(dataset: str, default_subset: str, default_split: str):
|
| 251 |
if "/" not in dataset.strip().strip("/"):
|
| 252 |
return {
|
| 253 |
subset_dropdown: gr.Dropdown(visible=False),
|
| 254 |
split_dropdown: gr.Dropdown(visible=False),
|
| 255 |
-
text_column_dropdown: gr.Dropdown(info="Text colum name to check
|
|
|
|
| 256 |
}
|
| 257 |
info_resp = session.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
|
| 258 |
if "error" in info_resp:
|
| 259 |
return {
|
| 260 |
subset_dropdown: gr.Dropdown(visible=False),
|
| 261 |
split_dropdown: gr.Dropdown(visible=False),
|
| 262 |
-
text_column_dropdown: gr.Dropdown(label="Text column name", info="Text colum name to check
|
|
|
|
| 263 |
}
|
| 264 |
subsets: list[str] = list(info_resp["dataset_info"])
|
| 265 |
subset = default_subset if default_subset in subsets else subsets[0]
|
| 266 |
splits: list[str] = info_resp["dataset_info"][subset]["splits"]
|
| 267 |
split = default_split if default_split in splits else splits[0]
|
| 268 |
features = info_resp["dataset_info"][subset]["features"]
|
| 269 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
return {
|
| 271 |
subset_dropdown: gr.Dropdown(value=subset, choices=subsets, visible=len(subsets) > 1),
|
| 272 |
split_dropdown: gr.Dropdown(value=split, choices=splits, visible=len(splits) > 1),
|
| 273 |
-
text_column_dropdown: gr.Dropdown(choices=text_features, label="Text column name", info="Text colum name to check (only non-nested texts are supported)"),
|
|
|
|
| 274 |
}
|
| 275 |
|
| 276 |
-
@dataset_name.change(inputs=[dataset_name], outputs=[subset_dropdown, split_dropdown, text_column_dropdown])
|
| 277 |
def show_input_from_subset_dropdown(dataset: str) -> dict:
|
| 278 |
-
return _resolve_dataset_selection(dataset, default_subset="default", default_split="train")
|
| 279 |
|
| 280 |
-
@subset_dropdown.change(inputs=[dataset_name, subset_dropdown], outputs=[subset_dropdown, split_dropdown, text_column_dropdown])
|
| 281 |
def show_input_from_subset_dropdown(dataset: str, subset: str) -> dict:
|
| 282 |
-
return _resolve_dataset_selection(dataset, default_subset=subset, default_split="train")
|
| 283 |
|
| 284 |
-
@split_dropdown.change(inputs=[dataset_name, subset_dropdown, split_dropdown], outputs=[subset_dropdown, split_dropdown, text_column_dropdown])
|
| 285 |
def show_input_from_split_dropdown(dataset: str, subset: str, split: str) -> dict:
|
| 286 |
-
return _resolve_dataset_selection(dataset, default_subset=subset, default_split=split)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 287 |
|
| 288 |
gr.Markdown("## Run nvidia quality classifier")
|
| 289 |
batch_size = gr.Slider(0, 64, 32, step=4, label="Inference batch size", info="(set this to smaller value if this space crashes.)")
|
|
@@ -305,13 +344,13 @@ with gr.Blocks() as demo:
|
|
| 305 |
|
| 306 |
gr.Examples(
|
| 307 |
[
|
| 308 |
-
["HuggingFaceFW/fineweb-edu", "default", "train", "text", 16, 500],
|
| 309 |
-
["fka/awesome-chatgpt-prompts", "default", "train", "prompt", 64, 200],
|
| 310 |
-
["proj-persona/PersonaHub", "instruction", "train", "synthesized text", 32, 1000],
|
| 311 |
-
["argilla/FinePersonas-v0.1", "default", "train", "persona", 64, 1000],
|
| 312 |
-
["
|
| 313 |
],
|
| 314 |
-
[dataset_name, subset_dropdown, split_dropdown, text_column_dropdown, batch_size, num_examples],
|
| 315 |
[progress_bar, plot, df_low, df_medium, df_high, texts_df],
|
| 316 |
fn=run_quality_check,
|
| 317 |
run_on_click=False,
|
|
@@ -320,7 +359,7 @@ with gr.Blocks() as demo:
|
|
| 320 |
|
| 321 |
gr_check_btn.click(
|
| 322 |
run_quality_check,
|
| 323 |
-
inputs=[dataset_name, subset_dropdown, split_dropdown, text_column_dropdown, batch_size, num_examples],
|
| 324 |
outputs=[progress_bar, plot, df_low, df_medium, df_high, texts_df]
|
| 325 |
)
|
| 326 |
|
|
@@ -335,7 +374,7 @@ with gr.Blocks() as demo:
|
|
| 335 |
toxicity_df = gr.DataFrame()
|
| 336 |
gr_toxicity_btn.click(
|
| 337 |
call_perspective_api,
|
| 338 |
-
inputs=[texts_df, text_column_dropdown, dataset_name, subset_dropdown, split_dropdown],#, checkbox],
|
| 339 |
outputs=[toxicity_progress_bar, toxicity_hist, toxicity_df]
|
| 340 |
)
|
| 341 |
|
|
|
|
| 107 |
logging.info("Data fetched.")
|
| 108 |
|
| 109 |
data_sample = data.sample(num_examples, seed=16) if data.shape[0] > num_examples else data
|
| 110 |
+
texts = data_sample[column].to_list()
|
| 111 |
+
if nested_column:
|
| 112 |
+
texts = [text[nested_column] for text in texts]
|
| 113 |
predictions, texts_processed = [], []
|
| 114 |
num_examples = min(len(texts), num_examples)
|
| 115 |
for i in range(0, num_examples, batch_size):
|
|
|
|
| 146 |
|
| 147 |
return fig
|
| 148 |
|
| 149 |
+
def call_perspective_api(texts_df, column_name, nested_column_name, dataset, config, split):#, full_check=False):
|
| 150 |
headers = {
|
| 151 |
"content-type": "application/json",
|
| 152 |
}
|
|
|
|
| 156 |
if texts_df.values.tolist() == [['', '', '']]:
|
| 157 |
logging.info(f"Fetching data for {dataset=} {config=} {split=} {column_name=}")
|
| 158 |
try:
|
| 159 |
+
filename = get_first_parquet_filename(dataset, config, split)
|
| 160 |
+
except Exception as error:
|
| 161 |
+
yield f"❌ {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
|
| 162 |
+
return
|
| 163 |
+
|
| 164 |
+
try:
|
| 165 |
+
logging.info(f"Loading hf://datasets/{dataset}@~parquet/{filename}")
|
| 166 |
+
texts_df = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{filename}", columns=[column_name])
|
| 167 |
+
except Exception as error:
|
| 168 |
+
yield f"❌ {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
|
| 169 |
+
return
|
| 170 |
logging.info("Data fetched.")
|
| 171 |
texts_df = texts_df.to_pandas()
|
| 172 |
|
|
|
|
| 173 |
texts = texts_df.sample(100, random_state=16)[column_name].values if texts_df.shape[0] > 100 else texts_df[column_name].values
|
| 174 |
+
if nested_column_name:
|
| 175 |
+
texts = [text[nested_column_name] for text in texts]
|
| 176 |
|
| 177 |
n_samples = len(texts)
|
| 178 |
for i, text in tqdm(enumerate(texts), desc="scanning with perspective"):
|
|
|
|
| 249 |
"""
|
| 250 |
return gr.HTML(value=html_code)
|
| 251 |
|
| 252 |
+
with gr.Row():
|
| 253 |
+
text_column_dropdown = gr.Dropdown(label="Text column name", info="Text colum name to check. ")
|
| 254 |
+
nested_text_column_dropdown = gr.Dropdown(label="Nested text key")#, visible=False)
|
| 255 |
|
| 256 |
+
def _resolve_dataset_selection(dataset: str, default_subset: str, default_split: str, text_feature):
|
| 257 |
if "/" not in dataset.strip().strip("/"):
|
| 258 |
return {
|
| 259 |
subset_dropdown: gr.Dropdown(visible=False),
|
| 260 |
split_dropdown: gr.Dropdown(visible=False),
|
| 261 |
+
text_column_dropdown: gr.Dropdown(info="Text colum name to check"),
|
| 262 |
+
nested_text_column_dropdown: gr.Dropdown(visible=False)
|
| 263 |
}
|
| 264 |
info_resp = session.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
|
| 265 |
if "error" in info_resp:
|
| 266 |
return {
|
| 267 |
subset_dropdown: gr.Dropdown(visible=False),
|
| 268 |
split_dropdown: gr.Dropdown(visible=False),
|
| 269 |
+
text_column_dropdown: gr.Dropdown(label="Text column name", info="Text colum name to check"),
|
| 270 |
+
nested_text_column_dropdown: gr.Dropdown(visible=False)
|
| 271 |
}
|
| 272 |
subsets: list[str] = list(info_resp["dataset_info"])
|
| 273 |
subset = default_subset if default_subset in subsets else subsets[0]
|
| 274 |
splits: list[str] = info_resp["dataset_info"][subset]["splits"]
|
| 275 |
split = default_split if default_split in splits else splits[0]
|
| 276 |
features = info_resp["dataset_info"][subset]["features"]
|
| 277 |
+
|
| 278 |
+
def _is_string_feature(feature):
|
| 279 |
+
return isinstance(feature, dict) and feature.get("dtype") == "string"
|
| 280 |
+
|
| 281 |
+
text_features = [feature_name for feature_name, feature in features.items() if _is_string_feature(feature)]
|
| 282 |
+
nested_features = [feature_name for feature_name, feature in features.items() if isinstance(feature, dict) and isinstance(next(iter(feature.values())), dict)]
|
| 283 |
+
nested_text_features = [feature_name for feature_name in nested_features if any(_is_string_feature(nested_feature) for nested_feature in features[feature_name].values())]
|
| 284 |
+
if not text_feature:
|
| 285 |
+
return {
|
| 286 |
+
subset_dropdown: gr.Dropdown(value=subset, choices=subsets, visible=len(subsets) > 1),
|
| 287 |
+
split_dropdown: gr.Dropdown(value=split, choices=splits, visible=len(splits) > 1),
|
| 288 |
+
text_column_dropdown: gr.Dropdown(choices=text_features + nested_text_features, label="Text column name",
|
| 289 |
+
info="Text colum name to check"),
|
| 290 |
+
nested_text_column_dropdown: gr.Dropdown(visible=False),
|
| 291 |
+
}
|
| 292 |
+
logging.info(nested_text_features)
|
| 293 |
+
if text_feature in nested_text_features:
|
| 294 |
+
nested_keys = [feature_name for feature_name, feature in features[text_feature].items() if _is_string_feature(feature)]
|
| 295 |
+
return {
|
| 296 |
+
subset_dropdown: gr.Dropdown(value=subset, choices=subsets, visible=len(subsets) > 1),
|
| 297 |
+
split_dropdown: gr.Dropdown(value=split, choices=splits, visible=len(splits) > 1),
|
| 298 |
+
text_column_dropdown: gr.Dropdown(choices=text_features + nested_text_features,
|
| 299 |
+
label="Text column name",
|
| 300 |
+
info="Text colum name to check (only non-nested texts are supported)"),
|
| 301 |
+
nested_text_column_dropdown: gr.Dropdown(value=nested_keys[0], choices=nested_keys,
|
| 302 |
+
label="Nested text column name", visible=True)
|
| 303 |
+
}
|
| 304 |
return {
|
| 305 |
subset_dropdown: gr.Dropdown(value=subset, choices=subsets, visible=len(subsets) > 1),
|
| 306 |
split_dropdown: gr.Dropdown(value=split, choices=splits, visible=len(splits) > 1),
|
| 307 |
+
text_column_dropdown: gr.Dropdown(choices=text_features + nested_text_features, label="Text column name", info="Text colum name to check (only non-nested texts are supported)"),
|
| 308 |
+
nested_text_column_dropdown: gr.Dropdown(visible=False),
|
| 309 |
}
|
| 310 |
|
| 311 |
+
@dataset_name.change(inputs=[dataset_name], outputs=[subset_dropdown, split_dropdown, text_column_dropdown, nested_text_column_dropdown])
|
| 312 |
def show_input_from_subset_dropdown(dataset: str) -> dict:
|
| 313 |
+
return _resolve_dataset_selection(dataset, default_subset="default", default_split="train", text_feature=None)
|
| 314 |
|
| 315 |
+
@subset_dropdown.change(inputs=[dataset_name, subset_dropdown], outputs=[subset_dropdown, split_dropdown, text_column_dropdown, nested_text_column_dropdown])
|
| 316 |
def show_input_from_subset_dropdown(dataset: str, subset: str) -> dict:
|
| 317 |
+
return _resolve_dataset_selection(dataset, default_subset=subset, default_split="train", text_feature=None)
|
| 318 |
|
| 319 |
+
@split_dropdown.change(inputs=[dataset_name, subset_dropdown, split_dropdown], outputs=[subset_dropdown, split_dropdown, text_column_dropdown, nested_text_column_dropdown])
|
| 320 |
def show_input_from_split_dropdown(dataset: str, subset: str, split: str) -> dict:
|
| 321 |
+
return _resolve_dataset_selection(dataset, default_subset=subset, default_split=split, text_feature=None)
|
| 322 |
+
|
| 323 |
+
@text_column_dropdown.change(inputs=[dataset_name, subset_dropdown, split_dropdown, text_column_dropdown], outputs=[subset_dropdown, split_dropdown, text_column_dropdown, nested_text_column_dropdown])
|
| 324 |
+
def show_input_from_text_column_dropdown(dataset: str, subset: str, split: str, text_column) -> dict:
|
| 325 |
+
return _resolve_dataset_selection(dataset, default_subset=subset, default_split=split, text_feature=text_column)
|
| 326 |
|
| 327 |
gr.Markdown("## Run nvidia quality classifier")
|
| 328 |
batch_size = gr.Slider(0, 64, 32, step=4, label="Inference batch size", info="(set this to smaller value if this space crashes.)")
|
|
|
|
| 344 |
|
| 345 |
gr.Examples(
|
| 346 |
[
|
| 347 |
+
["HuggingFaceFW/fineweb-edu", "default", "train", "text", None, 16, 500],
|
| 348 |
+
# ["fka/awesome-chatgpt-prompts", "default", "train", "prompt", 64, 200],
|
| 349 |
+
# ["proj-persona/PersonaHub", "instruction", "train", "synthesized text", 32, 1000],
|
| 350 |
+
["argilla/FinePersonas-v0.1", "default", "train", "persona", None, 64, 1000],
|
| 351 |
+
["allenai/real-toxicity-prompts", "default", "train", "continuation", "text", 64, 1000],
|
| 352 |
],
|
| 353 |
+
[dataset_name, subset_dropdown, split_dropdown, text_column_dropdown, nested_text_column_dropdown, batch_size, num_examples],
|
| 354 |
[progress_bar, plot, df_low, df_medium, df_high, texts_df],
|
| 355 |
fn=run_quality_check,
|
| 356 |
run_on_click=False,
|
|
|
|
| 359 |
|
| 360 |
gr_check_btn.click(
|
| 361 |
run_quality_check,
|
| 362 |
+
inputs=[dataset_name, subset_dropdown, split_dropdown, text_column_dropdown, nested_text_column_dropdown, batch_size, num_examples],
|
| 363 |
outputs=[progress_bar, plot, df_low, df_medium, df_high, texts_df]
|
| 364 |
)
|
| 365 |
|
|
|
|
| 374 |
toxicity_df = gr.DataFrame()
|
| 375 |
gr_toxicity_btn.click(
|
| 376 |
call_perspective_api,
|
| 377 |
+
inputs=[texts_df, text_column_dropdown, nested_text_column_dropdown, dataset_name, subset_dropdown, split_dropdown],#, checkbox],
|
| 378 |
outputs=[toxicity_progress_bar, toxicity_hist, toxicity_df]
|
| 379 |
)
|
| 380 |
|