Spaces:
Sleeping
Sleeping
Sam-Oliveira committed on
Commit ·
792575c
1
Parent(s): fd2e156
Change TAB 1 retrieval logic
Browse files- src/scrape.py +29 -23
- src/streamlit_app.py +9 -9
src/scrape.py
CHANGED
|
@@ -61,42 +61,48 @@ def make_tags(title, abstract, top_n=5):
|
|
| 61 |
def scrape(max_results=MAX_RESULTS, **criteria):
|
| 62 |
query = build_query(**criteria)
|
| 63 |
search = arxiv.Search(query=query,
|
| 64 |
-
max_results=max_results,
|
| 65 |
sort_by=arxiv.SortCriterion.SubmittedDate)
|
| 66 |
|
| 67 |
conn = get_conn()
|
| 68 |
-
|
|
|
|
| 69 |
|
| 70 |
for p in search.results():
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
# Check if paper already exists
|
| 74 |
existing = conn.execute("SELECT id FROM papers WHERE id=?", (p.entry_id,)).fetchone()
|
| 75 |
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
(
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
|
|
|
|
|
|
| 92 |
'title': p.title,
|
| 93 |
'authors': ", ".join(a.name for a in p.authors),
|
| 94 |
'abstract': p.summary,
|
| 95 |
'published': p.published.isoformat()
|
| 96 |
})
|
|
|
|
| 97 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
time.sleep(1)
|
| 99 |
-
conn.commit()
|
| 100 |
|
| 101 |
-
|
|
|
|
| 102 |
|
|
|
|
| 61 |
def scrape(max_results=MAX_RESULTS, **criteria):
|
| 62 |
query = build_query(**criteria)
|
| 63 |
search = arxiv.Search(query=query,
|
| 64 |
+
max_results=max_results * 3, # Get more results to filter from
|
| 65 |
sort_by=arxiv.SortCriterion.SubmittedDate)
|
| 66 |
|
| 67 |
conn = get_conn()
|
| 68 |
+
search_results = [] # Track papers from current search that aren't in database
|
| 69 |
+
papers_added = 0
|
| 70 |
|
| 71 |
for p in search.results():
|
| 72 |
+
# Check if paper already exists in database
|
|
|
|
|
|
|
| 73 |
existing = conn.execute("SELECT id FROM papers WHERE id=?", (p.entry_id,)).fetchone()
|
| 74 |
|
| 75 |
+
if not existing and papers_added < max_results:
|
| 76 |
+
# Paper doesn't exist, add it
|
| 77 |
+
tags = make_tags(p.title, p.summary)
|
| 78 |
+
conn.execute(
|
| 79 |
+
"INSERT INTO papers VALUES (?,?,?,?,?,?,?)",
|
| 80 |
+
(
|
| 81 |
+
p.entry_id,
|
| 82 |
+
p.title,
|
| 83 |
+
", ".join(a.name for a in p.authors),
|
| 84 |
+
p.summary,
|
| 85 |
+
p.published.isoformat(),
|
| 86 |
+
None, # summary placeholder
|
| 87 |
+
tags
|
| 88 |
+
),
|
| 89 |
+
)
|
| 90 |
+
|
| 91 |
+
# Add to search results
|
| 92 |
+
search_results.append({
|
| 93 |
'title': p.title,
|
| 94 |
'authors': ", ".join(a.name for a in p.authors),
|
| 95 |
'abstract': p.summary,
|
| 96 |
'published': p.published.isoformat()
|
| 97 |
})
|
| 98 |
+
papers_added += 1
|
| 99 |
|
| 100 |
+
# Stop if enough papers have been added
|
| 101 |
+
if papers_added >= max_results:
|
| 102 |
+
break
|
| 103 |
+
|
| 104 |
time.sleep(1)
|
|
|
|
| 105 |
|
| 106 |
+
conn.commit()
|
| 107 |
+
return search_results
|
| 108 |
|
src/streamlit_app.py
CHANGED
|
@@ -37,7 +37,7 @@ tab1, tab2, tab3 = st.tabs(["๐ Search", "๐ Digest", "💡 Ideate"])
|
|
| 37 |
|
| 38 |
|
| 39 |
with tab1:
|
| 40 |
-
st.header("Search papers")
|
| 41 |
c1, c2, c3, c4 = st.columns(4)
|
| 42 |
topic = c1.text_input("Topic")
|
| 43 |
title = c2.text_input("Title")
|
|
@@ -45,18 +45,18 @@ with tab1:
|
|
| 45 |
category = c4.text_input("Category (e.g. cs.CL)")
|
| 46 |
k = st.slider("Max papers", 5, 50, 25)
|
| 47 |
if st.button("Run search"):
|
| 48 |
-
with st.spinner("
|
| 49 |
-
|
| 50 |
author=author, category=category)
|
| 51 |
-
st.success("All done!")
|
| 52 |
|
| 53 |
-
if
|
| 54 |
-
|
|
|
|
| 55 |
paper_rows = [(p['title'], p['authors'], p['abstract'], p['published'])
|
| 56 |
-
for p in
|
| 57 |
st.components.v1.html(render_rows(paper_rows), height=600, scrolling=True)
|
| 58 |
else:
|
| 59 |
-
st.info("No new papers
|
| 60 |
|
| 61 |
|
| 62 |
with tab2:
|
|
@@ -95,6 +95,6 @@ with tab3:
|
|
| 95 |
ideas = ideate_from_ids(ids)
|
| 96 |
if ideas is None:
|
| 97 |
st.info("Those IDs aren't in the database yet. "
|
| 98 |
-
"Fetch them via the
|
| 99 |
else:
|
| 100 |
st.markdown(f"```\n{ideas}\n```")
|
|
|
|
| 37 |
|
| 38 |
|
| 39 |
with tab1:
|
| 40 |
+
st.header("Search for papers you have not yet read")
|
| 41 |
c1, c2, c3, c4 = st.columns(4)
|
| 42 |
topic = c1.text_input("Topic")
|
| 43 |
title = c2.text_input("Title")
|
|
|
|
| 45 |
category = c4.text_input("Category (e.g. cs.CL)")
|
| 46 |
k = st.slider("Max papers", 5, 50, 25)
|
| 47 |
if st.button("Run search"):
|
| 48 |
+
with st.spinner("Finding new papers for your search..."):
|
| 49 |
+
search_results = scrape(max_results=k, topic=topic, title=title,
|
| 50 |
author=author, category=category)
|
|
|
|
| 51 |
|
| 52 |
+
if search_results:
|
| 53 |
+
st.success(f"Found {len(search_results)} new papers for your search!")
|
| 54 |
+
# Convert search results to the format expected by render_rows
|
| 55 |
paper_rows = [(p['title'], p['authors'], p['abstract'], p['published'])
|
| 56 |
+
for p in search_results]
|
| 57 |
st.components.v1.html(render_rows(paper_rows), height=600, scrolling=True)
|
| 58 |
else:
|
| 59 |
+
st.info("No new papers found for this search. All recent papers on this topic are already in your database.")
|
| 60 |
|
| 61 |
|
| 62 |
with tab2:
|
|
|
|
| 95 |
ideas = ideate_from_ids(ids)
|
| 96 |
if ideas is None:
|
| 97 |
st.info("Those IDs aren't in the database yet. "
|
| 98 |
+
"Fetch them via the Search tab, then try again.")
|
| 99 |
else:
|
| 100 |
st.markdown(f"```\n{ideas}\n```")
|