BRAINIAC2677 committed on
Commit
e1eba48
·
1 Parent(s): 39a70a4
Files changed (6) hide show
  1. Pipfile +19 -0
  2. Pipfile.lock +0 -0
  3. README.md +13 -13
  4. app/__init__.py +0 -0
  5. app/search.py +70 -0
  6. main.py +38 -0
Pipfile ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [[source]]
2
+ url = "https://pypi.org/simple"
3
+ verify_ssl = true
4
+ name = "pypi"
5
+
6
+ [packages]
7
+ streamlit = "*"
8
+ langchain = "*"
9
+ ollama = "*"
10
+ llama-index = "*"
11
+ tiktoken = "*"
12
+ faiss-cpu = "*"
13
+ arxiv = "*"
14
+
15
+ [dev-packages]
16
+
17
+ [requires]
18
+ python_version = "3.12"
19
+ python_full_version = "3.12.1"
Pipfile.lock ADDED
The diff for this file is too large to render. See raw diff
 
README.md CHANGED
@@ -1,14 +1,14 @@
1
- ---
2
- title: Paper Scholar
3
- emoji: 📊
4
- colorFrom: purple
5
- colorTo: yellow
6
- sdk: streamlit
7
- sdk_version: 1.41.1
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- short_description: Paper Scholar is a research paper search and analysis tool.
12
- ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Paper Scholar
 
 
 
 
 
 
 
 
 
 
 
2
 
3
+ Paper Scholar is a research paper search and analysis tool that integrates open-source LLMs for document understanding and querying.
4
+
5
+ ## Features
6
+ - Search for research papers from arXiv or Google Scholar.
7
+ - Chatbox to query specific papers.
8
+ - Dark-themed UI with yellow highlights.
9
+
10
+ ## Installation
11
+ 1. Clone the repository and open it in a GitHub Codespace.
12
+ 2. Install dependencies:
13
+ ```bash
14
+ pipenv install
app/__init__.py ADDED
File without changes
app/search.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import arxiv
2
+ import faiss
3
+ from langchain.vectorstores import FAISS
4
+ from langchain_huggingface import HuggingFaceEmbeddings
5
+ from langchain_community.docstore.in_memory import InMemoryDocstore
6
+
7
+
8
def fetch_papers(query, max_results=60):
    """Fetch arXiv papers matching *query*, sorted by relevance.

    Args:
        query: Search string passed to the arXiv API.
        max_results: Upper bound on the number of papers requested.

    Returns:
        A list of dicts, one per result, with the keys ``"title"``,
        ``"summary"`` and ``"url"`` (the result's ``entry_id``).
    """
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance,
    )
    # Results are fetched through a Client; calling Search.results()
    # directly is deprecated in arxiv 2.x.
    client = arxiv.Client()
    return [
        {
            "title": result.title,
            "summary": result.summary,
            "url": result.entry_id,
        }
        for result in client.results(search)
    ]
22
+
23
+
24
# Module-level state: one embedding model and one in-memory FAISS store
# shared by every caller that imports this module.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Embed a throwaway string once; the length of the resulting vector is the
# dimensionality the flat L2 index has to be created with.
_probe_vector = embeddings.embed_query("hello world")
index = faiss.IndexFlatL2(len(_probe_vector))

# The store starts empty: no documents and no index-to-id mappings yet.
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)
33
+
34
+
35
def index_papers(papers, vector_store=vector_store):
    """Add papers to the vector store, skipping URLs that are already indexed.

    Args:
        papers: Iterable of dicts with the keys ``"title"``, ``"summary"``
            and ``"url"``.
        vector_store: Store to add to; defaults to the module-level
            ``vector_store``.

    Returns:
        The same ``vector_store`` object, after any new papers were added.
    """
    # Collect the URLs already present by reading stored metadata directly.
    # A metadata-filtered similarity search only inspects the nearest
    # ``fetch_k`` candidates, so it can miss an existing entry; scanning the
    # docstore is exact and needs no embedding call per paper.
    indexed_urls = set()
    for doc_id in vector_store.index_to_docstore_id.values():
        doc = vector_store.docstore.search(doc_id)
        # The in-memory docstore returns a message string (not a Document)
        # for an unknown id, so tolerate objects without metadata.
        metadata = getattr(doc, "metadata", None) or {}
        url = metadata.get("url")
        if url is not None:
            indexed_urls.add(url)

    new_papers = []
    for paper in papers:
        if paper["url"] in indexed_urls:
            continue
        # Record the URL immediately so duplicates within this batch are
        # skipped as well.
        indexed_urls.add(paper["url"])
        new_papers.append(paper)

    if new_papers:
        vector_store.add_texts(
            texts=[paper["summary"] for paper in new_papers],
            metadatas=[
                {"title": paper["title"], "url": paper["url"]}
                for paper in new_papers
            ],
        )

    return vector_store
59
+
60
+
61
def search_papers(query, vector_store, top_k=5):
    """Return the ``top_k`` stored papers most similar to *query*.

    Args:
        query: Text to search for.
        vector_store: Object exposing ``similarity_search(query, k=...)``.
        top_k: Number of hits to request from the store.

    Returns:
        A list of dicts with the keys ``"title"``, ``"summary"`` and
        ``"url"``, in the order the store returned the hits.
    """
    hits = vector_store.similarity_search(query, k=top_k)
    papers = []
    for hit in hits:
        meta = hit.metadata
        papers.append(
            {
                "title": meta["title"],
                "summary": hit.page_content,
                "url": meta["url"],
            }
        )
    return papers
64
+
65
+
66
+
67
+
68
+
69
+
70
+
main.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from app.search import fetch_papers, index_papers, search_papers, vector_store
3
+
4
+
5
# Set page configuration
st.set_page_config(
    page_title="Paper Scholar",
    page_icon=":page_with_curl:",
    layout="centered",
    initial_sidebar_state="expanded",
)

st.title(":page_with_curl: Paper Scholar")

# User control for number of shown papers
n_shown_paper = st.slider("Number of papers to display:", min_value=1, max_value=20, value=5, step=1)
# Request several times more papers from arXiv than are displayed, so the
# vector search has a larger pool to pick the top_k from.
search_multiplier = 5
top_k = n_shown_paper
max_results = search_multiplier * top_k

# Search bar for papers
query = st.text_input("Search for research papers:")
if query:
    with st.spinner("Fetching and indexing papers..."):
        papers = fetch_papers(query, max_results=max_results)
        vector_store = index_papers(papers)
        results = search_papers(query, vector_store, top_k=top_k)

    st.subheader("Search Results")
    for result in results:
        # Collapse runs of whitespace: a title containing a line break
        # would otherwise split the markdown heading and break the link.
        title = " ".join(result["title"].split())

        # Display title with a link to the full paper
        st.markdown(f"### [{title}]({result['url']})")

        # Foldable summary using expander
        with st.expander("View Summary"):
            st.write(result['summary'])

        st.markdown("---")