Sam-Oliveira committed on
Commit
792575c
·
1 Parent(s): fd2e156

Change TAB 1 retrieval logic

Browse files
Files changed (2) hide show
  1. src/scrape.py +29 -23
  2. src/streamlit_app.py +9 -9
src/scrape.py CHANGED
@@ -61,42 +61,48 @@ def make_tags(title, abstract, top_n=5):
61
  def scrape(max_results=MAX_RESULTS, **criteria):
62
  query = build_query(**criteria)
63
  search = arxiv.Search(query=query,
64
- max_results=max_results,
65
  sort_by=arxiv.SortCriterion.SubmittedDate)
66
 
67
  conn = get_conn()
68
- scraped_papers = [] # Track papers that were just scraped
 
69
 
70
  for p in search.results():
71
- tags = make_tags(p.title, p.summary)
72
-
73
- # Check if paper already exists
74
  existing = conn.execute("SELECT id FROM papers WHERE id=?", (p.entry_id,)).fetchone()
75
 
76
- conn.execute(
77
- "INSERT OR IGNORE INTO papers VALUES (?,?,?,?,?,?,?)",
78
- (
79
- p.entry_id,
80
- p.title,
81
- ", ".join(a.name for a in p.authors),
82
- p.summary,
83
- p.published.isoformat(),
84
- None, # summary placeholder
85
- tags
86
- ),
87
- )
88
-
89
- # If paper was newly inserted (not ignored), add to scraped_papers
90
- if not existing:
91
- scraped_papers.append({
 
 
92
  'title': p.title,
93
  'authors': ", ".join(a.name for a in p.authors),
94
  'abstract': p.summary,
95
  'published': p.published.isoformat()
96
  })
 
97
 
 
 
 
 
98
  time.sleep(1)
99
- conn.commit()
100
 
101
- return scraped_papers
 
102
 
 
61
  def scrape(max_results=MAX_RESULTS, **criteria):
62
  query = build_query(**criteria)
63
  search = arxiv.Search(query=query,
64
+ max_results=max_results * 3, # Get more results to filter from
65
  sort_by=arxiv.SortCriterion.SubmittedDate)
66
 
67
  conn = get_conn()
68
+ search_results = [] # Track papers from current search that aren't in database
69
+ papers_added = 0
70
 
71
  for p in search.results():
72
+ # Check if paper already exists in database
 
 
73
  existing = conn.execute("SELECT id FROM papers WHERE id=?", (p.entry_id,)).fetchone()
74
 
75
+ if not existing and papers_added < max_results:
76
+ # Paper doesn't exist, add it
77
+ tags = make_tags(p.title, p.summary)
78
+ conn.execute(
79
+ "INSERT INTO papers VALUES (?,?,?,?,?,?,?)",
80
+ (
81
+ p.entry_id,
82
+ p.title,
83
+ ", ".join(a.name for a in p.authors),
84
+ p.summary,
85
+ p.published.isoformat(),
86
+ None, # summary placeholder
87
+ tags
88
+ ),
89
+ )
90
+
91
+ # Add to search results
92
+ search_results.append({
93
  'title': p.title,
94
  'authors': ", ".join(a.name for a in p.authors),
95
  'abstract': p.summary,
96
  'published': p.published.isoformat()
97
  })
98
+ papers_added += 1
99
 
100
+ # Stop if enough papers have been added
101
+ if papers_added >= max_results:
102
+ break
103
+
104
  time.sleep(1)
 
105
 
106
+ conn.commit()
107
+ return search_results
108
 
src/streamlit_app.py CHANGED
@@ -37,7 +37,7 @@ tab1, tab2, tab3 = st.tabs(["🔍 Search", "📑 Digest", "💡 Ideate"])
37
 
38
 
39
  with tab1:
40
- st.header("Search papers")
41
  c1, c2, c3, c4 = st.columns(4)
42
  topic = c1.text_input("Topic")
43
  title = c2.text_input("Title")
@@ -45,18 +45,18 @@ with tab1:
45
  category = c4.text_input("Category (e.g. cs.CL)")
46
  k = st.slider("Max papers", 5, 50, 25)
47
  if st.button("Run search"):
48
- with st.spinner("Scraping papers, and storing them..."):
49
- scraped_papers = scrape(max_results=k, topic=topic, title=title,
50
  author=author, category=category)
51
- st.success("All done!")
52
 
53
- if scraped_papers:
54
- # Convert scraped papers to the format expected by render_rows
 
55
  paper_rows = [(p['title'], p['authors'], p['abstract'], p['published'])
56
- for p in scraped_papers]
57
  st.components.v1.html(render_rows(paper_rows), height=600, scrolling=True)
58
  else:
59
- st.info("No new papers were found. All papers from this search already exist in the database.")
60
 
61
 
62
  with tab2:
@@ -95,6 +95,6 @@ with tab3:
95
  ideas = ideate_from_ids(ids)
96
  if ideas is None:
97
  st.info("Those IDs aren't in the database yet. "
98
- "Fetch them via the **Search** tab, then try again.")
99
  else:
100
  st.markdown(f"```\n{ideas}\n```")
 
37
 
38
 
39
  with tab1:
40
+ st.header("Search for papers you have not yet read")
41
  c1, c2, c3, c4 = st.columns(4)
42
  topic = c1.text_input("Topic")
43
  title = c2.text_input("Title")
 
45
  category = c4.text_input("Category (e.g. cs.CL)")
46
  k = st.slider("Max papers", 5, 50, 25)
47
  if st.button("Run search"):
48
+ with st.spinner("Finding new papers for your search..."):
49
+ search_results = scrape(max_results=k, topic=topic, title=title,
50
  author=author, category=category)
 
51
 
52
+ if search_results:
53
+ st.success(f"Found {len(search_results)} new papers for your search!")
54
+ # Convert search results to the format expected by render_rows
55
  paper_rows = [(p['title'], p['authors'], p['abstract'], p['published'])
56
+ for p in search_results]
57
  st.components.v1.html(render_rows(paper_rows), height=600, scrolling=True)
58
  else:
59
+ st.info("No new papers found for this search. All recent papers on this topic are already in your database.")
60
 
61
 
62
  with tab2:
 
95
  ideas = ideate_from_ids(ids)
96
  if ideas is None:
97
  st.info("Those IDs aren't in the database yet. "
98
+ "Fetch them via the Search tab, then try again.")
99
  else:
100
  st.markdown(f"```\n{ideas}\n```")