Spaces:
Runtime error
Runtime error
polinaeterna
committed on
Commit
·
ac73d94
1
Parent(s):
9962eae
fix toxicity for bad requests
Browse files
app.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
import requests
|
| 2 |
from collections import Counter
|
| 3 |
|
| 4 |
-
from fontTools.subset import subset
|
| 5 |
from requests.adapters import HTTPAdapter, Retry
|
| 6 |
import os
|
| 7 |
import time
|
|
@@ -101,6 +100,7 @@ def run_quality_check(dataset, config, split, column, nested_column, batch_size,
|
|
| 101 |
|
| 102 |
try:
|
| 103 |
logging.info(f"Loading hf://datasets/{dataset}@~parquet/{filename}")
|
|
|
|
| 104 |
data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{filename}", columns=[column])
|
| 105 |
except Exception as error:
|
| 106 |
yield f"β {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
|
|
@@ -151,7 +151,8 @@ def call_perspective_api(texts_df, column_name, nested_column_name, dataset, con
|
|
| 151 |
headers = {
|
| 152 |
"content-type": "application/json",
|
| 153 |
}
|
| 154 |
-
req_att_scores = {attr: [] for attr in REQUESTED_ATTRIBUTES}
|
|
|
|
| 155 |
|
| 156 |
# fetch data if it doesn't exist yet
|
| 157 |
if texts_df.values.tolist() == [['', '', '']]:
|
|
@@ -164,6 +165,7 @@ def call_perspective_api(texts_df, column_name, nested_column_name, dataset, con
|
|
| 164 |
|
| 165 |
try:
|
| 166 |
logging.info(f"Loading hf://datasets/{dataset}@~parquet/{filename}")
|
|
|
|
| 167 |
texts_df = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{filename}", columns=[column_name])
|
| 168 |
except Exception as error:
|
| 169 |
yield f"β {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
|
|
@@ -188,11 +190,13 @@ def call_perspective_api(texts_df, column_name, nested_column_name, dataset, con
|
|
| 188 |
except Exception as e:
|
| 189 |
logging.info(e)
|
| 190 |
logging.info(data)
|
| 191 |
-
|
|
|
|
| 192 |
|
| 193 |
if req_response.ok:
|
| 194 |
response = req_response.json()
|
| 195 |
if ATT_SCORE in response:
|
|
|
|
| 196 |
for req_att in REQUESTED_ATTRIBUTES:
|
| 197 |
if req_att in response[ATT_SCORE]:
|
| 198 |
att_score = response[ATT_SCORE][req_att][SUM_SCORE]["value"]
|
|
@@ -206,13 +210,16 @@ def call_perspective_api(texts_df, column_name, nested_column_name, dataset, con
|
|
| 206 |
req_response.raise_for_status()
|
| 207 |
except Exception as e:
|
| 208 |
logging.info(e)
|
| 209 |
-
|
|
|
|
|
|
|
|
|
|
| 210 |
if i % 10 == 0:
|
| 211 |
plot_toxicity(req_att_scores)
|
| 212 |
-
yield {"toxicity check in progress...": i / n_samples}, plt.gcf(), pd.DataFrame.from_dict({
|
| 213 |
|
| 214 |
plot_toxicity(req_att_scores)
|
| 215 |
-
yield {"toxicity check finished.": 1.}, plt.gcf(), pd.DataFrame.from_dict({
|
| 216 |
|
| 217 |
|
| 218 |
with gr.Blocks() as demo:
|
|
@@ -326,7 +333,7 @@ with gr.Blocks() as demo:
|
|
| 326 |
gr.Markdown("## Run nvidia quality classifier")
|
| 327 |
batch_size = gr.Slider(0, 64, 32, step=4, label="Inference batch size", info="(set this to smaller value if this space crashes.)")
|
| 328 |
num_examples = gr.Slider(0, 5000, 500, step=10, label="Number of examples", info="Number of random examples to run quality classifier on")
|
| 329 |
-
gr_check_btn = gr.Button("Check
|
| 330 |
progress_bar = gr.Label(show_label=False)
|
| 331 |
plot = gr.BarPlot()
|
| 332 |
|
|
@@ -365,7 +372,7 @@ with gr.Blocks() as demo:
|
|
| 365 |
gr.Markdown("""## Explore toxicity
|
| 366 |
Run [Perspective](https://perspectiveapi.com/how-it-works/) on 100 random samples to check toxicity
|
| 367 |
""")
|
| 368 |
-
gr_toxicity_btn = gr.Button("
|
| 369 |
toxicity_progress_bar = gr.Label(show_label=False)
|
| 370 |
toxicity_hist = gr.Plot()
|
| 371 |
with gr.Accordion("Explore examples with toxicity scores:", open=False):
|
|
|
|
| 1 |
import requests
|
| 2 |
from collections import Counter
|
| 3 |
|
|
|
|
| 4 |
from requests.adapters import HTTPAdapter, Retry
|
| 5 |
import os
|
| 6 |
import time
|
|
|
|
| 100 |
|
| 101 |
try:
|
| 102 |
logging.info(f"Loading hf://datasets/{dataset}@~parquet/{filename}")
|
| 103 |
+
yield f"loading data...", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
|
| 104 |
data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{filename}", columns=[column])
|
| 105 |
except Exception as error:
|
| 106 |
yield f"β {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
|
|
|
|
| 151 |
headers = {
|
| 152 |
"content-type": "application/json",
|
| 153 |
}
|
| 154 |
+
req_att_scores = {**{attr: [] for attr in REQUESTED_ATTRIBUTES}}
|
| 155 |
+
texts_processed = {column_name: []}
|
| 156 |
|
| 157 |
# fetch data if it doesn't exist yet
|
| 158 |
if texts_df.values.tolist() == [['', '', '']]:
|
|
|
|
| 165 |
|
| 166 |
try:
|
| 167 |
logging.info(f"Loading hf://datasets/{dataset}@~parquet/{filename}")
|
| 168 |
+
yield f"loading data...", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
|
| 169 |
texts_df = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{filename}", columns=[column_name])
|
| 170 |
except Exception as error:
|
| 171 |
yield f"β {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
|
|
|
|
| 190 |
except Exception as e:
|
| 191 |
logging.info(e)
|
| 192 |
logging.info(data)
|
| 193 |
+
yield {"bad request, example skipped...": i / n_samples}, plt.gcf(), pd.DataFrame.from_dict({**texts_processed, **req_att_scores})
|
| 194 |
+
continue
|
| 195 |
|
| 196 |
if req_response.ok:
|
| 197 |
response = req_response.json()
|
| 198 |
if ATT_SCORE in response:
|
| 199 |
+
texts_processed[column_name].append(text)
|
| 200 |
for req_att in REQUESTED_ATTRIBUTES:
|
| 201 |
if req_att in response[ATT_SCORE]:
|
| 202 |
att_score = response[ATT_SCORE][req_att][SUM_SCORE]["value"]
|
|
|
|
| 210 |
req_response.raise_for_status()
|
| 211 |
except Exception as e:
|
| 212 |
logging.info(e)
|
| 213 |
+
logging.info(data)
|
| 214 |
+
yield {"bad request, example skipped": i / n_samples}, plt.gcf(), pd.DataFrame.from_dict({**texts_processed, **req_att_scores})
|
| 215 |
+
continue
|
| 216 |
+
|
| 217 |
if i % 10 == 0:
|
| 218 |
plot_toxicity(req_att_scores)
|
| 219 |
+
yield {"toxicity check in progress...": i / n_samples}, plt.gcf(), pd.DataFrame.from_dict({**texts_processed, **req_att_scores})
|
| 220 |
|
| 221 |
plot_toxicity(req_att_scores)
|
| 222 |
+
yield {"toxicity check finished.": 1.}, plt.gcf(), pd.DataFrame.from_dict({**texts_processed, **req_att_scores})
|
| 223 |
|
| 224 |
|
| 225 |
with gr.Blocks() as demo:
|
|
|
|
| 333 |
gr.Markdown("## Run nvidia quality classifier")
|
| 334 |
batch_size = gr.Slider(0, 64, 32, step=4, label="Inference batch size", info="(set this to smaller value if this space crashes.)")
|
| 335 |
num_examples = gr.Slider(0, 5000, 500, step=10, label="Number of examples", info="Number of random examples to run quality classifier on")
|
| 336 |
+
gr_check_btn = gr.Button("Check Quality")
|
| 337 |
progress_bar = gr.Label(show_label=False)
|
| 338 |
plot = gr.BarPlot()
|
| 339 |
|
|
|
|
| 372 |
gr.Markdown("""## Explore toxicity
|
| 373 |
Run [Perspective](https://perspectiveapi.com/how-it-works/) on 100 random samples to check toxicity
|
| 374 |
""")
|
| 375 |
+
gr_toxicity_btn = gr.Button("Check Toxicity")
|
| 376 |
toxicity_progress_bar = gr.Label(show_label=False)
|
| 377 |
toxicity_hist = gr.Plot()
|
| 378 |
with gr.Accordion("Explore examples with toxicity scores:", open=False):
|