Spaces:
Runtime error
Runtime error
polinaeterna
committed on
Commit
·
8d6975b
1
Parent(s):
2bd0078
update
Browse files
app.py
CHANGED
|
@@ -2,7 +2,7 @@ import gradio as gr
|
|
| 2 |
import polars as pl
|
| 3 |
from gradio_huggingfacehub_search import HuggingfaceHubSearch
|
| 4 |
import torch
|
| 5 |
-
import spaces
|
| 6 |
from torch import nn
|
| 7 |
from transformers import AutoModel, AutoTokenizer, AutoConfig
|
| 8 |
from huggingface_hub import PyTorchModelHubMixin
|
|
@@ -31,7 +31,7 @@ model = QualityModel.from_pretrained("nvidia/quality-classifier-deberta").to(dev
|
|
| 31 |
model.eval()
|
| 32 |
|
| 33 |
|
| 34 |
-
@spaces.GPU
|
| 35 |
def predict(texts: list[str]):
|
| 36 |
inputs = tokenizer(
|
| 37 |
texts, return_tensors="pt", padding="longest", truncation=True
|
|
@@ -46,26 +46,26 @@ def predict(texts: list[str]):
|
|
| 46 |
|
| 47 |
def run_quality_check(dataset, column, n_samples):
|
| 48 |
config = "default"
|
| 49 |
-
data = pl.read_parquet(f"hf://datasets/{dataset}
|
| 50 |
-
texts = data[column].
|
| 51 |
predictions = predict(texts[:n_samples])
|
| 52 |
-
return pd.DataFrame({"quality": predictions})
|
| 53 |
|
| 54 |
|
| 55 |
with gr.Blocks() as demo:
|
| 56 |
gr.Markdown("# 💫 Dataset Quality Checker 💫")
|
| 57 |
-
|
| 58 |
label="Hub Dataset ID",
|
| 59 |
placeholder="Search for dataset id on Huggingface",
|
| 60 |
search_type="dataset",
|
| 61 |
value="fka/awesome-chatgpt-prompts",
|
| 62 |
)
|
| 63 |
-
dataset_name = HuggingfaceHubSearch(
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
)
|
| 69 |
# config_name = "default" # TODO: user input
|
| 70 |
@gr.render(inputs=dataset_name)
|
| 71 |
def embed(name):
|
|
@@ -82,6 +82,8 @@ with gr.Blocks() as demo:
|
|
| 82 |
n_samples = gr.Number(label="Num first samples to run check")
|
| 83 |
gr_check_btn = gr.Button("Check Dataset")
|
| 84 |
# plot = gr.BarPlot()
|
| 85 |
-
df = gr.DataFrame(
|
| 86 |
gr_check_btn.click(run_quality_check, inputs=[dataset_name, text_column, n_samples], outputs=[df])
|
| 87 |
-
gr.BarPlot(df)
|
|
|
|
|
|
|
|
|
| 2 |
import polars as pl
|
| 3 |
from gradio_huggingfacehub_search import HuggingfaceHubSearch
|
| 4 |
import torch
|
| 5 |
+
# import spaces
|
| 6 |
from torch import nn
|
| 7 |
from transformers import AutoModel, AutoTokenizer, AutoConfig
|
| 8 |
from huggingface_hub import PyTorchModelHubMixin
|
|
|
|
| 31 |
model.eval()
|
| 32 |
|
| 33 |
|
| 34 |
+
# @spaces.GPU
|
| 35 |
def predict(texts: list[str]):
|
| 36 |
inputs = tokenizer(
|
| 37 |
texts, return_tensors="pt", padding="longest", truncation=True
|
|
|
|
| 46 |
|
| 47 |
def run_quality_check(dataset, column, n_samples):
|
| 48 |
config = "default"
|
| 49 |
+
data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/train/0000.parquet", columns=[column])
|
| 50 |
+
texts = data[column].to_list()
|
| 51 |
predictions = predict(texts[:n_samples])
|
| 52 |
+
return pd.DataFrame({"quality": predictions})
|
| 53 |
|
| 54 |
|
| 55 |
with gr.Blocks() as demo:
|
| 56 |
gr.Markdown("# 💫 Dataset Quality Checker 💫")
|
| 57 |
+
dataset_name = HuggingfaceHubSearch(
|
| 58 |
label="Hub Dataset ID",
|
| 59 |
placeholder="Search for dataset id on Huggingface",
|
| 60 |
search_type="dataset",
|
| 61 |
value="fka/awesome-chatgpt-prompts",
|
| 62 |
)
|
| 63 |
+
# dataset_name = HuggingfaceHubSearch(
|
| 64 |
+
# label="Hub Dataset ID",
|
| 65 |
+
# placeholder="Search for dataset id on Huggingface",
|
| 66 |
+
# search_type="dataset",
|
| 67 |
+
# value="HuggingFaceFW/fineweb",
|
| 68 |
+
# )
|
| 69 |
# config_name = "default" # TODO: user input
|
| 70 |
@gr.render(inputs=dataset_name)
|
| 71 |
def embed(name):
|
|
|
|
| 82 |
n_samples = gr.Number(label="Num first samples to run check")
|
| 83 |
gr_check_btn = gr.Button("Check Dataset")
|
| 84 |
# plot = gr.BarPlot()
|
| 85 |
+
df = gr.DataFrame()
|
| 86 |
gr_check_btn.click(run_quality_check, inputs=[dataset_name, text_column, n_samples], outputs=[df])
|
| 87 |
+
# gr.BarPlot(df)
|
| 88 |
+
|
| 89 |
+
demo.launch()
|