Spaces:
Runtime error
Runtime error
polinaeterna
committed on
Commit
·
44cbba4
1
Parent(s):
7badbdb
get config and split with api, include partial datasets
Browse files
app.py
CHANGED
|
@@ -1,13 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
|
|
|
| 2 |
import polars as pl
|
|
|
|
| 3 |
from gradio_huggingfacehub_search import HuggingfaceHubSearch
|
|
|
|
| 4 |
import torch
|
| 5 |
-
import spaces
|
| 6 |
from torch import nn
|
| 7 |
from transformers import AutoModel, AutoTokenizer, AutoConfig
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
class QualityModel(nn.Module, PyTorchModelHubMixin):
|
|
@@ -64,8 +73,22 @@ def plot_and_df(texts, preds):
|
|
| 64 |
|
| 65 |
|
| 66 |
def run_quality_check(dataset, column, batch_size, num_examples):
|
| 67 |
-
config = "default"
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
texts = data[column].to_list()
|
| 70 |
# batch_size = 100
|
| 71 |
predictions, texts_processed = [], []
|
|
@@ -106,8 +129,8 @@ with gr.Blocks() as demo:
|
|
| 106 |
return gr.HTML(value=html_code)
|
| 107 |
|
| 108 |
text_column = gr.Textbox(placeholder="text", label="Text colum name to check (data must be non-nested, raw texts!)")
|
| 109 |
-
batch_size = gr.Slider(0, 128,
|
| 110 |
-
num_examples = gr.Number(
|
| 111 |
gr_check_btn = gr.Button("Check Dataset")
|
| 112 |
progress_bar = gr.Label(show_label=False)
|
| 113 |
plot = gr.BarPlot()
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
from collections import Counter
|
| 3 |
+
from requests.adapters import HTTPAdapter, Retry
|
| 4 |
+
|
| 5 |
import gradio as gr
|
| 6 |
+
import pandas as pd
|
| 7 |
import polars as pl
|
| 8 |
+
import spaces
|
| 9 |
from gradio_huggingfacehub_search import HuggingfaceHubSearch
|
| 10 |
+
from huggingface_hub import PyTorchModelHubMixin
|
| 11 |
import torch
|
|
|
|
| 12 |
from torch import nn
|
| 13 |
from transformers import AutoModel, AutoTokenizer, AutoConfig
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# Shared HTTP session with retry/backoff so transient datasets-server
# errors (502/503/504) don't immediately fail the whole quality check.
session = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
# Mount the retrying adapter for BOTH schemes: the datasets-server API is
# called over https (see run_quality_check), so mounting only 'http://'
# would leave those requests without any retry behavior.
session.mount('http://', HTTPAdapter(max_retries=retries))
session.mount('https://', HTTPAdapter(max_retries=retries))
|
| 20 |
|
| 21 |
|
| 22 |
class QualityModel(nn.Module, PyTorchModelHubMixin):
|
|
|
|
| 73 |
|
| 74 |
|
| 75 |
def run_quality_check(dataset, column, batch_size, num_examples):
|
| 76 |
+
# config = "default"
|
| 77 |
+
info_resp = session.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
|
| 78 |
+
if "error" in info_resp:
|
| 79 |
+
yield "❌ " + info_resp["error"], gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
|
| 80 |
+
return
|
| 81 |
+
config = "default" if "default" in info_resp["dataset_info"] else next(iter(info_resp["dataset_info"]))
|
| 82 |
+
split = "train" if "train" in info_resp["dataset_info"][config]["splits"] else next(
|
| 83 |
+
iter(info_resp["dataset_info"][config]["splits"]))
|
| 84 |
+
try:
|
| 85 |
+
data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/{split}/0000.parquet", columns=[column])
|
| 86 |
+
except pl.exceptions.ComputeError:
|
| 87 |
+
try:
|
| 88 |
+
data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/partial-{split}/0000.parquet", columns=[column])
|
| 89 |
+
except Exception as error:
|
| 90 |
+
yield f"❌ {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
|
| 91 |
+
return
|
| 92 |
texts = data[column].to_list()
|
| 93 |
# batch_size = 100
|
| 94 |
predictions, texts_processed = [], []
|
|
|
|
| 129 |
return gr.HTML(value=html_code)
|
| 130 |
|
| 131 |
# Inputs for the quality check: which dataset column holds the raw text,
# the inference batch size, and how many leading examples to score.
text_column = gr.Textbox(placeholder="text", label="Text column name to check (data must be non-nested, raw texts!)")
batch_size = gr.Slider(0, 128, 32, step=8, label="Inference batch size (set this to smaller value if this space crashes.)")
num_examples = gr.Number(500, label="Number of first examples to check")
gr_check_btn = gr.Button("Check Dataset")
# Outputs: a progress/status indicator and a bar plot of predicted labels.
progress_bar = gr.Label(show_label=False)
plot = gr.BarPlot()
|