Commit
·
976f652
1
Parent(s):
ad38c8f
improvements
Browse files
app.py
CHANGED
|
@@ -40,7 +40,7 @@ def format_row_for_model(row):
|
|
| 40 |
int2label = {0: "new_dataset", 1: "not_new_dataset"}
|
| 41 |
|
| 42 |
|
| 43 |
-
def get_predictions(data: list[dict], model=None, batch_size=
|
| 44 |
if model is None:
|
| 45 |
model = load_model()
|
| 46 |
predictions = []
|
|
@@ -65,8 +65,8 @@ def create_markdown(row):
|
|
| 65 |
updated = updated.strftime("%Y-%m-%d")
|
| 66 |
broad_category = row["broad_category"]
|
| 67 |
category = row["category"]
|
| 68 |
-
return f""" <
|
| 69 |
-
|
|
| 70 |
\n\n{abstract}
|
| 71 |
\n\n [Hugging Face Papers page]({hub_paper_url})
|
| 72 |
"""
|
|
@@ -87,34 +87,82 @@ def prepare_data():
|
|
| 87 |
return df
|
| 88 |
|
| 89 |
|
| 90 |
-
all_possible_arxiv_categories = prepare_data().category.unique().tolist()
|
| 91 |
-
broad_categories = prepare_data().broad_category.unique().tolist()
|
| 92 |
|
| 93 |
|
| 94 |
-
def create_markdown_summary(categories=
|
| 95 |
df = prepare_data()
|
| 96 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
df = df[df["broad_category"].isin(categories)]
|
| 98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
|
| 101 |
scheduler = BackgroundScheduler()
|
| 102 |
scheduler.add_job(prepare_data, "cron", hour=3, minute=30)
|
| 103 |
scheduler.start()
|
| 104 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
with gr.Blocks() as demo:
|
| 106 |
-
gr.Markdown("## New Datasets in Machine Learning")
|
| 107 |
gr.Markdown(
|
| 108 |
-
"
|
| 109 |
-
"
|
| 110 |
-
)
|
| 111 |
-
broad_categories = gr.Dropdown(
|
| 112 |
-
choices=broad_categories,
|
| 113 |
-
label="Categories",
|
| 114 |
-
multiselect=True,
|
| 115 |
-
value=broad_categories,
|
| 116 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
results = gr.Markdown(create_markdown_summary())
|
| 118 |
-
broad_categories.change(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
|
| 120 |
demo.launch()
|
|
|
|
| 40 |
int2label = {0: "new_dataset", 1: "not_new_dataset"}
|
| 41 |
|
| 42 |
|
| 43 |
+
def get_predictions(data: list[dict], model=None, batch_size=64):
|
| 44 |
if model is None:
|
| 45 |
model = load_model()
|
| 46 |
predictions = []
|
|
|
|
| 65 |
updated = updated.strftime("%Y-%m-%d")
|
| 66 |
broad_category = row["broad_category"]
|
| 67 |
category = row["category"]
|
| 68 |
+
return f""" <h2> {title} </h2> Updated: {updated}
|
| 69 |
+
| Category: {broad_category} | Subcategory: {category} |
|
| 70 |
\n\n{abstract}
|
| 71 |
\n\n [Hugging Face Papers page]({hub_paper_url})
|
| 72 |
"""
|
|
|
|
| 87 |
return df
|
| 88 |
|
| 89 |
|
| 90 |
+
# Distinct fine-grained arXiv categories present in the prepared data, put in
# sorted order; used below as the choices of the narrow-category dropdown.
all_possible_arxiv_categories = prepare_data().category.unique().tolist()
all_possible_arxiv_categories.sort()
# Distinct broad arXiv categories, likewise sorted; used as the choices of the
# broad-category dropdown.
broad_categories = prepare_data().broad_category.unique().tolist()
broad_categories.sort()
|
| 92 |
|
| 93 |
|
| 94 |
+
def create_markdown_summary(categories=None, new_only=True, narrow_categories=None):
    """Build the markdown listing of papers that match the given filters.

    Args:
        categories: Broad arXiv categories to keep. ``None`` disables the
            broad filter. Ignored when ``narrow_categories`` is non-empty.
        new_only: When true, keep only rows whose ``prediction`` column equals
            ``"new_dataset"``.
        narrow_categories: Fine-grained arXiv categories to keep. ``None`` or
            an empty selection means "no narrow filter"; a non-empty selection
            takes precedence over ``categories``.

    Returns:
        A markdown/HTML string: a heading, the number of matching rows, and
        the ``markdown`` cell of every matching row joined by ``<br>``.
    """
    df = prepare_data()
    if new_only:
        df = df[df["prediction"] == "new_dataset"]
    if narrow_categories:
        df = df[df["category"].isin(narrow_categories)]
    elif categories is not None:
        # A cleared multiselect arrives as an empty list rather than None.
        # Treating that as "no narrow filter" falls back to the broad filter
        # instead of matching against an empty set and dropping every row.
        df = df[df["broad_category"].isin(categories)]
    number_of_results = len(df)
    results = (
        "<h1 style='text-align: center'> arXiv papers related to datasets</h1> \n\n"
    )
    results += f"Number of results: {number_of_results}\n\n"
    results += "\n\n<br>".join(df["markdown"].tolist())
    return results
|
| 109 |
|
| 110 |
|
| 111 |
# Call prepare_data once a day at 03:30 (in the scheduler's timezone) from a
# background scheduler, presumably to refresh the data -- TODO confirm.
scheduler = BackgroundScheduler()
scheduler.add_job(prepare_data, trigger="cron", hour=3, minute=30)
scheduler.start()
|
| 114 |
|
| 115 |
+
# Introductory markdown rendered at the top of the page.
description = """This Space shows recent papers on arXiv that are *likely* to be papers introducing new datasets related to machine learning. \n\n
The Space works by:
- searching for papers on arXiv with the term `dataset` in the title + "machine learning" in the abstract
- passing the abstract and title of the papers to a machine learning model that predicts if the paper is introducing a new dataset or not

This Space is a work in progress. The model is not perfect, and the search query is not perfect. If you have suggestions for how to improve this Space, please open a Discussion.\n\n"""
|
| 121 |
+
|
| 122 |
+
|
| 123 |
with gr.Blocks() as demo:
    # Broad categories pre-selected on page load. Kept as a list because the
    # dropdown is multiselect and the summary function filters with ``isin``.
    default_broad_categories = ["cs"]

    gr.Markdown(
        "<h1 style='text-align: center'> ✨New Datasets in Machine Learning "
        " ✨ </h1>"
    )
    gr.Markdown(description)
    with gr.Row():
        # Named differently from the module-level ``broad_categories`` list so
        # the list of choices is not rebound to a component.
        broad_dropdown = gr.Dropdown(
            choices=broad_categories,
            label="Broad arXiv Category",
            multiselect=True,
            value=default_broad_categories,
            size="sm",
        )
    with gr.Accordion("Advanced Options", open=False):
        gr.Markdown(
            "Narrow by arXiv categories. **Note** this will take precedence over the"
            " broad category selection."
        )
        narrow_categories = gr.Dropdown(
            choices=all_possible_arxiv_categories,
            value=None,
            multiselect=True,
            label="Narrow arXiv Category",
        )
        gr.ClearButton(narrow_categories, "Clear Narrow Categories", size="sm")
    with gr.Row():
        new_only = gr.Checkbox(True, label="New Datasets Only", size="sm")
    # Render the initial listing with the same broad filter the dropdown shows,
    # so the first page view agrees with the visible selection.
    results = gr.Markdown(
        create_markdown_summary(categories=default_broad_categories)
    )

    # Every control re-renders the listing from the current state of all three.
    filter_inputs = [broad_dropdown, new_only, narrow_categories]
    broad_dropdown.change(
        create_markdown_summary,
        inputs=filter_inputs,
        outputs=results,
    )
    narrow_categories.change(
        create_markdown_summary,
        inputs=filter_inputs,
        outputs=results,
    )
    new_only.select(
        create_markdown_summary,
        filter_inputs,
        results,
    )

demo.launch()
|