Spaces:

librarian-bots
/

new-datasets-in-machine-learning

Running

App Files Community

davanstrien HF Staff committed on Oct 6, 2023

Commit

976f652

1 Parent(s): ad38c8f

improvements

Browse files

Files changed (1) hide show

app.py +66 -18

app.py CHANGED Viewed

@@ -40,7 +40,7 @@ def format_row_for_model(row):
 int2label = {0: "new_dataset", 1: "not_new_dataset"}
-def get_predictions(data: list[dict], model=None, batch_size=32):
     if model is None:
         model = load_model()
     predictions = []
@@ -65,8 +65,8 @@ def create_markdown(row):
     updated = updated.strftime("%Y-%m-%d")
     broad_category = row["broad_category"]
     category = row["category"]
-    return f""" <h1> {title} </h1> updated: {updated}
-    | category: {broad_category}  | subcategory: {category} |
 \n\n{abstract}
 \n\n [Hugging Face Papers page]({hub_paper_url})
     """
@@ -87,34 +87,82 @@ def prepare_data():
     return df
-all_possible_arxiv_categories = prepare_data().category.unique().tolist()
-broad_categories = prepare_data().broad_category.unique().tolist()
-def create_markdown_summary(categories=broad_categories, all_categories=None):
     df = prepare_data()
-    if categories is not None:
         df = df[df["broad_category"].isin(categories)]
-    return "\n\n".join(df["markdown"].tolist())
 scheduler = BackgroundScheduler()
 scheduler.add_job(prepare_data, "cron", hour=3, minute=30)
 scheduler.start()
 with gr.Blocks() as demo:
-    gr.Markdown("## New Datasets in Machine Learning")
     gr.Markdown(
-        "This Space attempts to show new papers on arXiv that are *likely* to be papers"
-        " introducing new datasets. \n\n"
-    )
-    broad_categories = gr.Dropdown(
-        choices=broad_categories,
-        label="Categories",
-        multiselect=True,
-        value=broad_categories,
     )
     results = gr.Markdown(create_markdown_summary())
-    broad_categories.change(create_markdown_summary, broad_categories, results)
 demo.launch()

 int2label = {0: "new_dataset", 1: "not_new_dataset"}
+def get_predictions(data: list[dict], model=None, batch_size=64):
     if model is None:
         model = load_model()
     predictions = []
     updated = updated.strftime("%Y-%m-%d")
     broad_category = row["broad_category"]
     category = row["category"]
+    return f""" <h2> {title} </h2> Updated: {updated}
+    | Category: {broad_category}  | Subcategory: {category} |
 \n\n{abstract}
 \n\n [Hugging Face Papers page]({hub_paper_url})
     """
     return df
+all_possible_arxiv_categories = sorted(prepare_data().category.unique().tolist())
+broad_categories = sorted(prepare_data().broad_category.unique().tolist())
+def create_markdown_summary(categories=None, new_only=True, narrow_categories=None):
     df = prepare_data()
+    if new_only:
+        df = df[df["prediction"] == "new_dataset"]
+    if narrow_categories is not None:
+        df = df[df["category"].isin(narrow_categories)]
+    if categories is not None and not narrow_categories:
         df = df[df["broad_category"].isin(categories)]
+    number_of_results = len(df)
+    results = (
+        "<h1 style='text-align: center'> arXiv papers related to datasets</h1> \n\n"
+    )
+    results += f"Number of results: {number_of_results}\n\n"
+    results += "\n\n<br>".join(df["markdown"].tolist())
+    return results
 scheduler = BackgroundScheduler()
 scheduler.add_job(prepare_data, "cron", hour=3, minute=30)
 scheduler.start()
+description = """This Space shows recent papers on arXiv that are *likely* to be papers introducing new datasets related to machine learning. \n\n
+The Space works by:
+- searching for papers on arXiv with the term `dataset` in the title + "machine learning" in the abstract
+- passing the abstract and title of the papers to a machine learning model that predicts if the paper is introducing a new dataset or not
+This Space is a WIP in progress. The model is not perfect, and the search query is not perfect. If you have  suggestions for how to improve this Space, please open a Discussion.\n\n"""
 with gr.Blocks() as demo:
     gr.Markdown(
+        "<h1 style='text-align: center'>  &#x2728;New Datasets in Machine Learning "
+        " &#x2728; </h1>"
     )
+    gr.Markdown(description)
+    with gr.Row():
+        broad_categories = gr.Dropdown(
+            choices=broad_categories,
+            label="Broad arXiv Category",
+            multiselect=True,
+            value="cs",
+            size="sm",
+        )
+    with gr.Accordion("Advanced Options", open=False):
+        gr.Markdown(
+            "Narrow by arXiv categories. **Note** this will take precedence over the"
+            " broad category selection."
+        )
+        narrow_categories = gr.Dropdown(
+            choices=all_possible_arxiv_categories,
+            value=None,
+            multiselect=True,
+            label="Narrow arXiv Category",
+        )
+        gr.ClearButton(narrow_categories, "Clear Narrow Categories", size="sm")
+    with gr.Row():
+        new_only = gr.Checkbox(True, label="New Datasets Only", size="sm")
     results = gr.Markdown(create_markdown_summary())
+    broad_categories.change(
+        create_markdown_summary,
+        inputs=[broad_categories, new_only, narrow_categories],
+        outputs=results,
+    )
+    narrow_categories.change(
+        create_markdown_summary,
+        inputs=[broad_categories, new_only, narrow_categories],
+        outputs=results,
+    )
+    new_only.select(
+        create_markdown_summary,
+        [broad_categories, new_only, narrow_categories],
+        results,
+    )
 demo.launch()