Sam-Oliveira committed on
Commit
792575c
·
1 Parent(s): fd2e156

Change TAB 1 retrieval logic

Browse files
Files changed (2) hide show
  1. src/scrape.py +29 -23
  2. src/streamlit_app.py +9 -9
src/scrape.py CHANGED
@@ -61,42 +61,48 @@ def make_tags(title, abstract, top_n=5):
61
  def scrape(max_results=MAX_RESULTS, **criteria):
62
  query = build_query(**criteria)
63
  search = arxiv.Search(query=query,
64
- max_results=max_results,
65
  sort_by=arxiv.SortCriterion.SubmittedDate)
66
 
67
  conn = get_conn()
68
- scraped_papers = [] # Track papers that were just scraped
 
69
 
70
  for p in search.results():
71
- tags = make_tags(p.title, p.summary)
72
-
73
- # Check if paper already exists
74
  existing = conn.execute("SELECT id FROM papers WHERE id=?", (p.entry_id,)).fetchone()
75
 
76
- conn.execute(
77
- "INSERT OR IGNORE INTO papers VALUES (?,?,?,?,?,?,?)",
78
- (
79
- p.entry_id,
80
- p.title,
81
- ", ".join(a.name for a in p.authors),
82
- p.summary,
83
- p.published.isoformat(),
84
- None, # summary placeholder
85
- tags
86
- ),
87
- )
88
-
89
- # If paper was newly inserted (not ignored), add to scraped_papers
90
- if not existing:
91
- scraped_papers.append({
 
 
92
  'title': p.title,
93
  'authors': ", ".join(a.name for a in p.authors),
94
  'abstract': p.summary,
95
  'published': p.published.isoformat()
96
  })
 
97
 
 
 
 
 
98
  time.sleep(1)
99
- conn.commit()
100
 
101
- return scraped_papers
 
102
 
 
61
  def scrape(max_results=MAX_RESULTS, **criteria):
62
  query = build_query(**criteria)
63
  search = arxiv.Search(query=query,
64
+ max_results=max_results * 3, # Get more results to filter from
65
  sort_by=arxiv.SortCriterion.SubmittedDate)
66
 
67
  conn = get_conn()
68
+ search_results = [] # Track papers from current search that aren't in database
69
+ papers_added = 0
70
 
71
  for p in search.results():
72
+ # Check if paper already exists in database
 
 
73
  existing = conn.execute("SELECT id FROM papers WHERE id=?", (p.entry_id,)).fetchone()
74
 
75
+ if not existing and papers_added < max_results:
76
+ # Paper doesn't exist, add it
77
+ tags = make_tags(p.title, p.summary)
78
+ conn.execute(
79
+ "INSERT INTO papers VALUES (?,?,?,?,?,?,?)",
80
+ (
81
+ p.entry_id,
82
+ p.title,
83
+ ", ".join(a.name for a in p.authors),
84
+ p.summary,
85
+ p.published.isoformat(),
86
+ None, # summary placeholder
87
+ tags
88
+ ),
89
+ )
90
+
91
+ # Add to search results
92
+ search_results.append({
93
  'title': p.title,
94
  'authors': ", ".join(a.name for a in p.authors),
95
  'abstract': p.summary,
96
  'published': p.published.isoformat()
97
  })
98
+ papers_added += 1
99
 
100
+ # Stop if enough papers have been added
101
+ if papers_added >= max_results:
102
+ break
103
+
104
  time.sleep(1)
 
105
 
106
+ conn.commit()
107
+ return search_results
108
 
src/streamlit_app.py CHANGED
@@ -37,7 +37,7 @@ tab1, tab2, tab3 = st.tabs(["🔍 Search", "📑 Digest", "💡 Ideate"])
37
 
38
 
39
  with tab1:
40
- st.header("Search papers")
41
  c1, c2, c3, c4 = st.columns(4)
42
  topic = c1.text_input("Topic")
43
  title = c2.text_input("Title")
@@ -45,18 +45,18 @@ with tab1:
45
  category = c4.text_input("Category (e.g. cs.CL)")
46
  k = st.slider("Max papers", 5, 50, 25)
47
  if st.button("Run search"):
48
- with st.spinner("Scraping papers, and storing them..."):
49
- scraped_papers = scrape(max_results=k, topic=topic, title=title,
50
  author=author, category=category)
51
- st.success("All done!")
52
 
53
- if scraped_papers:
54
- # Convert scraped papers to the format expected by render_rows
 
55
  paper_rows = [(p['title'], p['authors'], p['abstract'], p['published'])
56
- for p in scraped_papers]
57
  st.components.v1.html(render_rows(paper_rows), height=600, scrolling=True)
58
  else:
59
- st.info("No new papers were found. All papers from this search already exist in the database.")
60
 
61
 
62
  with tab2:
@@ -95,6 +95,6 @@ with tab3:
95
  ideas = ideate_from_ids(ids)
96
  if ideas is None:
97
  st.info("Those IDs aren't in the database yet. "
98
- "Fetch them via the **Search** tab, then try again.")
99
  else:
100
  st.markdown(f"```\n{ideas}\n```")
 
37
 
38
 
39
  with tab1:
40
+ st.header("Search for papers you have not yet read")
41
  c1, c2, c3, c4 = st.columns(4)
42
  topic = c1.text_input("Topic")
43
  title = c2.text_input("Title")
 
45
  category = c4.text_input("Category (e.g. cs.CL)")
46
  k = st.slider("Max papers", 5, 50, 25)
47
  if st.button("Run search"):
48
+ with st.spinner("Finding new papers for your search..."):
49
+ search_results = scrape(max_results=k, topic=topic, title=title,
50
  author=author, category=category)
 
51
 
52
+ if search_results:
53
+ st.success(f"Found {len(search_results)} new papers for your search!")
54
+ # Convert search results to the format expected by render_rows
55
  paper_rows = [(p['title'], p['authors'], p['abstract'], p['published'])
56
+ for p in search_results]
57
  st.components.v1.html(render_rows(paper_rows), height=600, scrolling=True)
58
  else:
59
+ st.info("No new papers found for this search. All recent papers on this topic are already in your database.")
60
 
61
 
62
  with tab2:
 
95
  ideas = ideate_from_ids(ids)
96
  if ideas is None:
97
  st.info("Those IDs aren't in the database yet. "
98
+ "Fetch them via the Search tab, then try again.")
99
  else:
100
  st.markdown(f"```\n{ideas}\n```")