Spaces:
Runtime error
Runtime error
polinaeterna
committed on
Commit
·
e5960a0
1
Parent(s):
4105710
set max examples manually
Browse files
app.py
CHANGED
|
@@ -56,19 +56,20 @@ def plot_and_df(texts, preds):
|
|
| 56 |
)
|
| 57 |
|
| 58 |
|
| 59 |
-
def run_quality_check(dataset, column, batch_size):
|
| 60 |
config = "default"
|
| 61 |
data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/train/0000.parquet", columns=[column])
|
| 62 |
texts = data[column].to_list()
|
| 63 |
# batch_size = 100
|
| 64 |
predictions, texts_processed = [], []
|
| 65 |
-
for i in range(0, min(len(texts),
|
| 66 |
batch_texts = texts[i:i+batch_size]
|
| 67 |
batch_predictions = predict(batch_texts)
|
| 68 |
predictions.extend(batch_predictions)
|
| 69 |
texts_processed.extend(batch_texts)
|
| 70 |
yield plot_and_df(texts_processed, predictions)
|
| 71 |
|
|
|
|
| 72 |
with gr.Blocks() as demo:
|
| 73 |
gr.Markdown("# π« Dataset Quality Checker π«")
|
| 74 |
dataset_name = HuggingfaceHubSearch(
|
|
@@ -91,11 +92,12 @@ with gr.Blocks() as demo:
|
|
| 91 |
return gr.HTML(value=html_code)
|
| 92 |
text_column = gr.Textbox(placeholder="text", label="Text colum name to check (data must be non-nested, raw texts!)")
|
| 93 |
batch_size = gr.Number(100, label="Batch size")
|
|
|
|
| 94 |
gr_check_btn = gr.Button("Check Dataset")
|
| 95 |
plot = gr.BarPlot()
|
| 96 |
|
| 97 |
with gr.Accordion("Explore some individual examples for each class", open=False):
|
| 98 |
df_low, df_medium, df_high = gr.DataFrame(), gr.DataFrame(), gr.DataFrame()
|
| 99 |
-
gr_check_btn.click(run_quality_check, inputs=[dataset_name, text_column, batch_size], outputs=[plot, df_low, df_medium, df_high])
|
| 100 |
|
| 101 |
demo.launch()
|
|
|
|
| 56 |
)
|
| 57 |
|
| 58 |
|
| 59 |
+
def run_quality_check(dataset, column, batch_size, num_examples):
    """Run predictions over a dataset's text column in batches, streaming results.

    Reads the ``column`` values from the first train parquet shard of
    ``dataset`` (config ``"default"``), passes them to ``predict`` in chunks of
    ``batch_size``, and after every chunk yields ``plot_and_df`` computed over
    everything processed so far.

    Args:
        dataset: Hub dataset id interpolated into the ``hf://datasets/...`` path.
        column: Name of the column holding the texts.
        batch_size: Number of texts per ``predict`` call. Converted with
            ``int()``; must be at least 1.
        num_examples: Upper bound on how many texts are processed in total.
            Converted with ``int()``; values <= 0 result in no batches.

    Yields:
        Whatever ``plot_and_df(texts_processed, predictions)`` returns, once
        per processed batch.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 after conversion.
        TypeError: If ``batch_size`` or ``num_examples`` cannot be converted
            to ``int`` (for example ``None``).
    """
    # Both values are wired to numeric UI inputs and may arrive as floats
    # (e.g. 100.0); ``range`` and slice indices only accept integers.
    batch_size = int(batch_size)
    num_examples = int(num_examples)
    # Validate before any I/O so bad input fails fast with a clear message.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    config = "default"
    data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/train/0000.parquet", columns=[column])
    # Truncate up front so the final batch cannot run past ``num_examples``
    # when it is not a multiple of ``batch_size``; ``max`` guards against a
    # negative limit being interpreted as "drop from the end".
    texts = data[column].to_list()[:max(num_examples, 0)]

    predictions, texts_processed = [], []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        batch_predictions = predict(batch_texts)
        predictions.extend(batch_predictions)
        texts_processed.extend(batch_texts)
        # Stream cumulative results so the UI refreshes after every batch.
        yield plot_and_df(texts_processed, predictions)
|
| 71 |
|
| 72 |
+
|
| 73 |
with gr.Blocks() as demo:
|
| 74 |
gr.Markdown("# π« Dataset Quality Checker π«")
|
| 75 |
dataset_name = HuggingfaceHubSearch(
|
|
|
|
| 92 |
return gr.HTML(value=html_code)
|
| 93 |
text_column = gr.Textbox(placeholder="text", label="Text colum name to check (data must be non-nested, raw texts!)")
|
| 94 |
batch_size = gr.Number(100, label="Batch size")
|
| 95 |
+
num_examples = gr.Number(1000, label="Num examples to check")
|
| 96 |
gr_check_btn = gr.Button("Check Dataset")
|
| 97 |
plot = gr.BarPlot()
|
| 98 |
|
| 99 |
with gr.Accordion("Explore some individual examples for each class", open=False):
|
| 100 |
df_low, df_medium, df_high = gr.DataFrame(), gr.DataFrame(), gr.DataFrame()
|
| 101 |
+
gr_check_btn.click(run_quality_check, inputs=[dataset_name, text_column, batch_size, num_examples], outputs=[plot, df_low, df_medium, df_high])
|
| 102 |
|
| 103 |
demo.launch()
|