from langchain.tools import BaseTool, StructuredTool, tool
from langchain_community.retrievers import ArxivRetriever
# from langchain_community.utilities import SerpAPIWrapper
from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper
# from langchain.tools import Tool
from langchain_google_community import GoogleSearchAPIWrapper
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.embeddings import GPT4AllEmbeddings
from app.core.config import settings
from langchain_community.vectorstores import Chroma
import arxiv
# import ast
import chromadb
import os

# hacky and should be replaced with a database
# from app.source_container.container import (
#     all_sources
# )
from app.utils.utils import (
    parse_list_to_dicts,
    format_wiki_summaries,
    format_arxiv_documents,
    format_search_results,
    create_wikipedia_urls_from_text,
    create_folder_if_not_exists,
)
from app.crud.db_handler import (
    add_many
)
from app.vector_store.chroma_vector_store import (
    add_pdf_to_vector_store
)
# from app.utils import create_wikipedia_urls_from_text

# persist_directory = os.getenv('VECTOR_DATABASE_LOCATION')
persist_directory = settings.VECTOR_DATABASE_LOCATION


def memory_search(query: str) -> str:
    """Search the memory vector store for existing knowledge and relevant previous research.
    This is your primary source: check what you have already learned in the past before going online."""
    # Since we have more than one collection, this tool should be renamed
    client = chromadb.PersistentClient(
        path=persist_directory,
    )
    # collection_name = os.getenv('CONVERSATION_COLLECTION_NAME')
    collection_name = settings.CONVERSATION_COLLECTION_NAME
    # store using envvar
    embedding_function = SentenceTransformerEmbeddings(
        model_name=settings.EMBEDDING_MODEL
        # model_name=os.getenv("EMBEDDING_MODEL"),
    )
    # embedding_function = GPT4AllEmbeddings()
    vector_db = Chroma(
        client=client,  # client for Chroma
        collection_name=collection_name,
        embedding_function=embedding_function,
    )
    retriever = vector_db.as_retriever()
    docs = retriever.get_relevant_documents(query)
    return str(docs)
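
# Illustrative usage (the query string is hypothetical):
#   memory_search("prior findings on retrieval-augmented generation")
# returns the retrieved Documents from the conversation collection as a string.
# knowledgeBase_search below applies the same retrieval pattern to the
# "ArxivPapers" collection.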


def knowledgeBase_search(query: str) -> str:
    """Search the internal knowledge base for research papers and relevant chunks."""
    # Since we have more than one collection, this tool should be renamed
    client = chromadb.PersistentClient(
        path=persist_directory,
    )
    collection_name = "ArxivPapers"
    # store using envvar
    embedding_function = SentenceTransformerEmbeddings(
        # model_name=os.getenv("EMBEDDING_MODEL"),
        model_name=settings.EMBEDDING_MODEL
    )
    # embedding_function = GPT4AllEmbeddings()
    vector_db = Chroma(
        client=client,  # client for Chroma
        collection_name=collection_name,
        embedding_function=embedding_function,
    )
    retriever = vector_db.as_retriever()
    docs = retriever.get_relevant_documents(query)
    return str(docs)


def arxiv_search(query: str) -> str:
    """Search the arxiv database for scientific research papers and studies. This is your primary
    online information source. Always check it first when you search for additional information,
    before using any other online tool."""
    # global all_sources
    arxiv_retriever = ArxivRetriever(load_max_docs=3)
    data = arxiv_retriever.invoke(query)
    formatted_sources = format_arxiv_documents(data)
    # all_sources += formatted_sources
    parsed_sources = parse_list_to_dicts(formatted_sources)
    add_many(parsed_sources)
    return str(data)
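
# Illustrative call (the query is hypothetical):
#   arxiv_search("chain-of-thought prompting")
# fetches up to 3 papers (load_max_docs=3), persists their formatted metadata
# through add_many, and returns the stringified Documents.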


def get_arxiv_paper(paper_id: str) -> None:
    """Download a paper from arxiv. To download a paper, input the arxiv id,
    such as "1605.08386v1". This tool is named get_arxiv_paper.
    Inputting a URL such as "http://arxiv.org/abs/2312.02813" will break the code;
    input only "2312.02813". Download one paper at a time, and keep the
    input/output free of additional information: provide only the id.
    """
    # code from https://lukasschwab.me/arxiv.py/arxiv.html
    paper = next(arxiv.Client().results(arxiv.Search(id_list=[paper_id])))
    number_without_period = paper_id.replace('.', '')
    # Ensure the target directory exists (embed_arvix_paper below does the same).
    create_folder_if_not_exists("./downloaded_papers")
    # Download the PDF to a specified directory with a custom filename.
    paper.download_pdf(dirpath="./downloaded_papers", filename=f"{number_without_period}.pdf")
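
# Illustrative call (the id is hypothetical): get_arxiv_paper("2312.02813")
# saves the PDF as ./downloaded_papers/231202813.pdf (the "." is stripped from
# the id to build the filename).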


def embed_arvix_paper(paper_id: str) -> None:
    """Download a paper from arxiv and embed it into the knowledge base. To use it,
    input the arxiv id, such as "1605.08386v1". This tool is named embed_arvix_paper.
    Inputting a URL such as "http://arxiv.org/abs/2312.02813" will break the code;
    input only "2312.02813". Embed one paper at a time, and keep the
    input/output free of additional information: provide only the id.
    """
    # code from https://lukasschwab.me/arxiv.py/arxiv.html
    paper = next(arxiv.Client().results(arxiv.Search(id_list=[paper_id])))
    number_without_period = paper_id.replace('.', '')
    pdf_file_name = f"{number_without_period}.pdf"
    pdf_directory = "./downloaded_papers"
    create_folder_if_not_exists(pdf_directory)
    # Download the PDF to a specified directory with a custom filename.
    paper.download_pdf(dirpath=pdf_directory, filename=pdf_file_name)
    client = chromadb.PersistentClient(
        path=persist_directory,
    )
    collection_name = "ArxivPapers"
    # store using envvar
    full_path = os.path.join(pdf_directory, pdf_file_name)
    add_pdf_to_vector_store(
        collection_name=collection_name,
        pdf_file_location=full_path,
    )
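
# Illustrative call (the id is hypothetical): embed_arvix_paper("2312.02813")
# downloads the PDF and hands it to add_pdf_to_vector_store for embedding into
# the "ArxivPapers" Chroma collection.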


def wikipedia_search(query: str) -> str:
    """Search Wikipedia for additional information to expand on research papers
    or when no papers can be found."""
    # global all_sources
    api_wrapper = WikipediaAPIWrapper()
    wikipedia_runner = WikipediaQueryRun(api_wrapper=api_wrapper)
    wikipedia_results = wikipedia_runner.run(query)
    formatted_summaries = format_wiki_summaries(wikipedia_results)
    # all_sources += formatted_summaries
    parsed_summaries = parse_list_to_dicts(formatted_summaries)
    add_many(parsed_summaries)
    # all_sources += create_wikipedia_urls_from_text(wikipedia_results)
    return wikipedia_results


def google_search(query: str) -> str:
    """Search Google for additional results when you can't answer questions
    using arxiv search or wikipedia search."""
    # global all_sources
    websearch = GoogleSearchAPIWrapper()
    search_results: list = websearch.results(query, 3)
    cleaner_sources = format_search_results(search_results)
    parsed_csources = parse_list_to_dicts(cleaner_sources)
    add_many(parsed_csources)
    # all_sources += cleaner_sources
    return str(cleaner_sources)
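

# A minimal sketch (an assumption, not the app's actual wiring) of how these
# functions could be exposed to an agent as LangChain tools via the
# StructuredTool import above; the list name is illustrative. Tool names and
# descriptions are inferred from each function's signature and docstring.
research_tools = [
    StructuredTool.from_function(memory_search),
    StructuredTool.from_function(knowledgeBase_search),
    StructuredTool.from_function(arxiv_search),
    StructuredTool.from_function(get_arxiv_paper),
    StructuredTool.from_function(embed_arvix_paper),
    StructuredTool.from_function(wikipedia_search),
    StructuredTool.from_function(google_search),
]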