PAOLO SANDEJAS committed
Commit · cd1ffaa
Parent(s): 97da4ac
Add app files
- app.py +488 -0
- papers/.DS_Store +0 -0
- requirements.txt +233 -0
app.py
ADDED
@@ -0,0 +1,488 @@
import pathlib
import textwrap

import os
import re
import json

import requests

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown

from chromadb import Documents, EmbeddingFunction, Embeddings
from pypdf import PdfReader
from pypdf.errors import PdfReadError
import chromadb
from typing import List
import shutil
import ast

# from timeout import timeout, TimeoutError

import gradio as gr

PAPERS_DIR = "/Users/paoloantoniosandejas/Documents/experiment-3/experiment-3/initial_experiments/ps/LLM Research Helper v2/papers"
RAG_DIR = "/Users/paoloantoniosandejas/Documents/experiment-3/experiment-3/initial_experiments/ps/LLM Research Helper v2/RAG/contents"


gemini_api_key = os.environ.get('GEMINI_API_KEY', '-1')
genai.configure(api_key=gemini_api_key)

S2_API_KEY = os.getenv('S2_API_KEY')
initial_result_limit = 10
final_result_limit = 5

# Select relevant fields to pull
fields = 'title,url,abstract,citationCount,authors,isOpenAccess,fieldsOfStudy,year,journal,openAccessPdf'
def raw_to_markdown(text):
    text = text.replace('•', ' *')
    return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))


def markdown_to_raw(markdown_text):
    """
    This function converts basic markdown text to raw text.

    Args:
        markdown_text: The markdown text string to be converted.

    Returns:
        A string containing the raw text equivalent of the markdown text.
    """
    # Remove headers
    text = re.sub(r'#+ ?', '', markdown_text)

    # Remove bold and italics (can be adjusted based on needs)
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)  # Bold
    text = re.sub(r'_(.+?)_', r'\1', text)  # Italics

    # Remove code blocks
    text = re.sub(r'`(.*?)`', '', text, flags=re.DOTALL)

    # Remove lists
    text = re.sub(r'\*+ (.*?)$', r'\1', text, flags=re.MULTILINE)  # Unordered lists
    text = text.strip()  # Remove extra whitespace (str.strip returns a new string rather than mutating)

    return text
def find_basis_papers(query):
    papers = None
    if not query:
        print('No query given')
        return None

    rsp = requests.get('https://api.semanticscholar.org/graph/v1/paper/search',
                       headers={'X-API-KEY': S2_API_KEY},
                       params={'query': query, 'limit': initial_result_limit, 'fields': fields})
    rsp.raise_for_status()
    results = rsp.json()
    total = results["total"]
    if not total:
        print('No matches found. Please try another query.')
        return None

    print(f'Found {total} initial results. Showing up to {initial_result_limit}.')
    papers = results['data']
    # print("INITIAL RESULTS")
    # print_papers(papers)

    # Filter paper results
    filtered_papers = list(filter(isValidPaper, papers))

    # print("FILTERED RESULTS")
    # print_papers(filtered_papers)

    # Rank paper results by recency, then citation count
    ranked_papers = sorted(filtered_papers, key=lambda x: (x['year'], x['citationCount']), reverse=True)

    # print("RANKED RESULTS")
    # print_papers(ranked_papers)

    # Return the best papers
    return ranked_papers[:final_result_limit]


# def find_recommendations(paper):
#     print(f"Up to {result_limit} recommendations based on: {paper['title']}")
#     rsp = requests.get(f"https://api.semanticscholar.org/recommendations/v1/papers/forpaper/{paper['paperId']}",
#                        headers={'X-API-KEY': S2_API_KEY},
#                        params={'fields': fields, 'limit': 10})
#     rsp.raise_for_status()
#     results = rsp.json()
#     print_papers(results['recommendedPapers'])
#     return results['recommendedPapers']


def print_papers(papers):
    for idx, paper in enumerate(papers):
        print(f"PAPER {idx}")
        for key, value in paper.items():
            if key != 'abstract':
                print(f"\t{key}: '{value}'")


def isValidPaper(paper):
    # A paper is usable only if it is open access with both an abstract and a PDF link
    return bool(paper['isOpenAccess'] and paper['abstract'] and paper['openAccessPdf'])


# def filter_papers(papers):
#     filtered_papers = []
#     for paper in papers:
#         if paper['isOpenAccess'] and paper['abstract'] and paper['openAccessPdf']:
#             # paper is acceptable
#             filtered_papers.append(paper)
#     return filtered_papers
def load_pdf(file_path):
    """
    Reads the text content from a PDF file and returns it as a single string.

    Parameters:
    - file_path (str): The file path to the PDF file.

    Returns:
    - str: The concatenated text content of all pages in the PDF.
    """
    # Logic to read pdf
    reader = PdfReader(file_path)

    # Loop over each page and accumulate its text
    text = ""
    for page in reader.pages:
        text += page.extract_text()

    return text


def split_text(text: str):
    """
    Splits a text string into a list of non-empty substrings based on the specified pattern.
    The "\n \n" pattern splits the document paragraph by paragraph.

    Parameters:
    - text (str): The input text to be split.

    Returns:
    - List[str]: A list containing non-empty substrings obtained by splitting the input text.
    """
    split_text = re.split('\n \n', text)
    return [i for i in split_text if i != ""]


class GeminiEmbeddingFunction(EmbeddingFunction):
    """
    Custom embedding function using the Gemini AI API for document retrieval.

    This class extends the EmbeddingFunction class and implements the __call__ method
    to generate embeddings for a given set of documents using the Gemini AI API.

    Parameters:
    - input (Documents): A collection of documents to be embedded.

    Returns:
    - Embeddings: Embeddings generated for the input documents.
    """
    def __call__(self, input: Documents) -> Embeddings:
        gemini_api_key = os.getenv("GEMINI_API_KEY")
        if not gemini_api_key:
            raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")
        genai.configure(api_key=gemini_api_key)
        model = "models/embedding-001"
        title = "Custom query"
        return genai.embed_content(model=model,
                                   content=input,
                                   task_type="retrieval_document",
                                   title=title)["embedding"]


def create_chroma_db(documents: List, path: str, name: str):
    """
    Creates a Chroma database using the provided documents, path, and collection name.

    Parameters:
    - documents: An iterable of documents to be added to the Chroma database.
    - path (str): The path where the Chroma database will be stored.
    - name (str): The name of the collection within the Chroma database.

    Returns:
    - Tuple[chromadb.Collection, str]: A tuple containing the created Chroma Collection and its name.
    """
    chroma_client = chromadb.PersistentClient(path=path)
    db = chroma_client.create_collection(name=name, embedding_function=GeminiEmbeddingFunction())

    for i, d in enumerate(documents):
        db.add(documents=d, ids=str(i))

    return db, name


def load_chroma_collection(path, name):
    """
    Loads an existing Chroma collection from the specified path with the given name.

    Parameters:
    - path (str): The path where the Chroma database is stored.
    - name (str): The name of the collection within the Chroma database.

    Returns:
    - chromadb.Collection: The loaded Chroma Collection.
    """
    chroma_client = chromadb.PersistentClient(path=path)
    db = chroma_client.get_collection(name=name, embedding_function=GeminiEmbeddingFunction())

    return db


def delete_chroma_collection(path, name):
    chroma_client = chromadb.PersistentClient(path=path)
    chroma_client.delete_collection(name=name)


def delete_all_paper_dbs(papers):
    for idx in range(len(papers)):
        delete_chroma_collection(path=RAG_DIR,
                                 name=f"paper_{idx}")


def get_relevant_passage(query, db, n_results):
    passage = db.query(query_texts=[query], n_results=n_results)['documents'][0]
    return passage
def make_rag_prompt(query, relevant_passage):
    escaped = relevant_passage.replace("'", "").replace('"', "").replace("\n", " ")
    prompt = ("""You are a helpful and informative bot that answers questions using text from the reference passage included below. \
    Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. \
    However, you are talking to a non-technical audience, so be sure to break down complicated concepts and \
    strike a friendly and conversational tone. \
    If the passage is irrelevant to the answer, you may ignore it.
    QUESTION: '{query}'
    PASSAGE: '{relevant_passage}'

    ANSWER:
    """).format(query=query, relevant_passage=escaped)

    return prompt


def generate_answer_prompt(prompt):
    gemini_api_key = os.getenv("GEMINI_API_KEY")
    if not gemini_api_key:
        raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")
    genai.configure(api_key=gemini_api_key)
    model = genai.GenerativeModel('gemini-pro')
    answer = model.generate_content(prompt)
    return answer.text


def generate_answer_db(db, query):
    # Retrieve the top 3 relevant text chunks
    relevant_text = get_relevant_passage(query, db, n_results=3)
    # print(relevant_text)
    prompt = make_rag_prompt(query,
                             relevant_passage="".join(relevant_text))  # join the relevant chunks into a single passage
    answer = generate_answer_prompt(prompt)

    return answer


def pull_paper(paper_url, filepath):
    r = requests.get(paper_url, timeout=60)  # bound the download so a slow host cannot hang the app
    with open(filepath, 'wb') as outfile:
        outfile.write(r.content)
def RAG_create_paper_dbs(papers):
    if os.path.exists(RAG_DIR) and os.path.isdir(RAG_DIR):
        # Delete current dbs
        shutil.rmtree(RAG_DIR)

    vector_dbs = {}  # key: name, value: db

    urls = [p['openAccessPdf']['url'] if p['openAccessPdf'] else None for p in papers]

    print(urls)

    for idx, test_paper in enumerate(papers):
        # Get full paper
        paper_title = test_paper['title']
        paper_primary_author = test_paper['authors'][0]['name']
        paper_url = test_paper['openAccessPdf']['url'] if test_paper['openAccessPdf'] else None
        paper_year = test_paper['year']
        paper_abstract = test_paper['abstract']

        filename = f"{paper_primary_author} {paper_year} - {paper_title}.pdf"
        filepath = f"{PAPERS_DIR}/{filename}"

        print(f'getting {filename}...')

        # Indices listed here are skipped (e.g. papers already saved locally)
        skip_idxs = []
        if idx not in skip_idxs and paper_url is not None:
            try:
                pull_paper(paper_url, filepath)
            except requests.exceptions.Timeout:
                print("Paper taking too long...")

        print('\t- DONE!')

        # Initialize to the abstract as a fallback
        pdf_text = paper_abstract

        try:
            pdf_text = load_pdf(file_path=filepath)
        except (PdfReadError, OSError):
            print("\t- invalid PDF file! Using abstract as fallback")

        print(f'saving {filename} as a vector db...')

        # Save paper as vector DB
        chunked_text = split_text(text=pdf_text)

        db, name = create_chroma_db(documents=chunked_text,
                                    path=RAG_DIR,
                                    name=f"paper_{idx}")

        vector_dbs[name] = db

        print('\t- DONE!')

    return vector_dbs


def ask_all_papers(vector_dbs, query):
    answers = {}
    for name, db in vector_dbs.items():
        db = load_chroma_collection(path=RAG_DIR, name=name)

        answer = generate_answer_db(db, query=query)

        # print(f"{name} answer: {answer}\n\n")

        answers[name] = answer

    return answers
def GEMINI_list_features(answers):
    generation_config = {
        "temperature": 0.5
        # "top_p": 0.95,
        # "top_k": 0,
        # "max_output_tokens": 8192,
    }

    model = genai.GenerativeModel(model_name='gemini-pro', generation_config=generation_config)
    chat = model.start_chat(history=[])

    prompt = f"""Given the following lists of variables considered,
    return a list of the common variables. Only return a python list.
    LISTS OF VARIABLES CONSIDERED: {answers}"""

    response = chat.send_message(prompt)
    response = markdown_to_raw(response.text)

    return response


def GEMINI_predict_target(initial_query: str):
    # Initialize Gemini LLM
    model = genai.GenerativeModel('gemini-pro')
    chat = model.start_chat(history=[])

    prompt = f"""Given this search query, what does the user want to predict?
    QUERY: {initial_query}.
    Only return the answer"""

    response = chat.send_message(prompt)
    predict_target = markdown_to_raw(response.text)

    return predict_target


def GEMINI_optimize_query(initial_query: str):
    # Initialize Gemini LLM
    model = genai.GenerativeModel('gemini-pro')
    chat = model.start_chat(history=[])

    prompt = f"""Given a search query, return an optimized version of the query to find related academic papers
    QUERY: {initial_query}.
    Only return the optimized query. If you feel the query is already concise and optimized, return the original query"""

    response = chat.send_message(prompt)
    optimized_query = markdown_to_raw(response.text)

    return optimized_query


def GEMINI_summarize_abstracts(initial_query: str, papers: str):
    # Initialize Gemini LLM
    model = genai.GenerativeModel('gemini-pro')
    chat = model.start_chat(history=[])

    prompt = f"""Given the following academic papers,
    return a review of related literature for the search query: {initial_query}.
    Focus on data/key factors and methodologies considered.
    Here are the papers {papers}
    Include the paper urls at the end of the review of related literature.
    """
    response = chat.send_message(prompt)
    abstract_summary = markdown_to_raw(response.text)

    return abstract_summary
def predict(message, history):
    # if message == "delete":
    #     delete_all_paper_dbs(papers)
    # if history == []:
    predict_target = GEMINI_predict_target(message)
    papers = find_basis_papers(message)
    vector_dbs = RAG_create_paper_dbs(papers)
    # predict_target = 'solar site score'

    answers = ask_all_papers(vector_dbs, f"list the independent variables considered to predict {predict_target}")
    feature_list = GEMINI_list_features(answers)
    res = ast.literal_eval(feature_list)

    response = f"""
    COMMON FEATURES TO CONSIDER: {res}

    vectordb answers: {answers}
    """

    delete_all_paper_dbs(papers)
    # response = summarizer_chat.send_message(message)
    # response_text = markdown_to_raw(response.text)

    return response


def main():
    # GEMINI optimizes query
    gr.ChatInterface(
        predict,
        title="LLM Research Helper",
        description="""Start by inputting a brief description/title
        of your research and our assistant will return a review of
        related literature

        ex. Finding optimal site locations for solar farms"""
    ).launch(debug=True)


if __name__ == '__main__':
    main()
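
For context, a minimal sketch of how these pieces compose outside the Gradio interface. It is illustrative, not part of the commit: it assumes GEMINI_API_KEY and S2_API_KEY are set in the environment and that app.py is importable as a module named app.

# Minimal usage sketch (assumed module name `app`; sample query taken from the UI description)
from app import find_basis_papers, RAG_create_paper_dbs, ask_all_papers, delete_all_paper_dbs

query = "Finding optimal site locations for solar farms"  # illustrative query
papers = find_basis_papers(query)                         # search, filter, and rank via Semantic Scholar
if papers:
    dbs = RAG_create_paper_dbs(papers)                    # download PDFs, build one Chroma collection per paper
    answers = ask_all_papers(dbs, "list the independent variables considered")
    for name, answer in answers.items():
        print(f"{name}: {answer}")
    delete_all_paper_dbs(papers)                          # drop the per-paper collections when done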
papers/.DS_Store
ADDED
Binary file (6.15 kB)
requirements.txt
ADDED
@@ -0,0 +1,233 @@
aiofiles==23.2.1
aiohttp==3.9.4
aiosignal==1.3.1
altair==5.3.0
annotated-types==0.6.0
anyio==4.3.0
appnope==0.1.4
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
arrow==1.3.0
arxiv==2.1.0
asgiref==3.8.1
asttokens==2.4.1
async-lru==2.0.4
attrs==23.2.0
Babel==2.14.0
backoff==2.2.1
bcrypt==4.1.3
beautifulsoup4==4.12.3
bleach==6.1.0
build==1.2.1
cachetools==5.3.3
certifi==2024.2.2
cffi==1.16.0
charset-normalizer==3.3.2
chroma-hnswlib==0.7.3
chromadb==0.4.22
click==8.1.7
coloredlogs==15.0.1
comm==0.2.2
contourpy==1.2.1
cycler==0.12.1
dataclasses-json==0.6.5
datasets==2.18.0
debugpy==1.8.1
decorator==5.1.1
defusedxml==0.7.1
Deprecated==1.2.14
dill==0.3.8
distro==1.9.0
evaluate==0.4.1
executing==2.0.1
fastapi==0.110.1
fastjsonschema==2.19.1
feedparser==6.0.10
ffmpy==0.3.2
filelock==3.13.4
flatbuffers==24.3.25
fonttools==4.51.0
fqdn==1.5.1
frozenlist==1.4.1
fsspec==2024.2.0
google-ai-generativelanguage==0.6.2
google-api-core==2.19.0
google-api-python-client==2.125.0
google-auth==2.29.0
google-auth-httplib2==0.2.0
google-generativeai==0.5.2
googleapis-common-protos==1.63.0
gradio==4.28.3
gradio_client==0.16.0
grpcio==1.62.1
grpcio-status==1.62.1
h11==0.14.0
httpcore==1.0.5
httplib2==0.22.0
httptools==0.6.1
httpx==0.27.0
huggingface-hub==0.22.2
humanfriendly==10.0
idna==3.7
importlib-metadata==7.0.0
importlib_resources==6.4.0
ipykernel==6.29.4
ipython==8.23.0
ipywidgets==8.1.2
isoduration==20.11.0
jedi==0.19.1
Jinja2==3.1.3
jiwer==3.0.3
json5==0.9.25
jsonpatch==1.33
jsonpointer==2.4
jsonschema==4.21.1
jsonschema-specifications==2023.12.1
jupyter==1.0.0
jupyter-console==6.6.3
jupyter-events==0.10.0
jupyter-lsp==2.2.5
jupyter_client==8.6.1
jupyter_core==5.7.2
jupyter_server==2.14.0
jupyter_server_terminals==0.5.3
jupyterlab==4.1.6
jupyterlab_pygments==0.3.0
jupyterlab_server==2.26.0
jupyterlab_widgets==3.0.10
kiwisolver==1.4.5
kubernetes==29.0.0
langchain==0.1.17
langchain-community==0.0.36
langchain-core==0.1.50
langchain-google-genai==1.0.3
langchain-text-splitters==0.0.1
langsmith==0.1.54
markdown-it-py==3.0.0
MarkupSafe==2.1.5
marshmallow==3.21.2
matplotlib==3.8.4
matplotlib-inline==0.1.6
mdurl==0.1.2
mistune==3.0.2
mmh3==4.1.0
monotonic==1.6
mpmath==1.3.0
multidict==6.0.5
multiprocess==0.70.16
mypy-extensions==1.0.0
nbclient==0.10.0
nbconvert==7.16.3
nbformat==5.10.4
nest-asyncio==1.6.0
notebook==7.1.2
notebook_shim==0.2.4
numpy==1.26.4
oauthlib==3.2.2
onnxruntime==1.17.3
openai==1.17.1
opentelemetry-api==1.24.0
opentelemetry-exporter-otlp-proto-common==1.24.0
opentelemetry-exporter-otlp-proto-grpc==1.24.0
opentelemetry-instrumentation==0.45b0
opentelemetry-instrumentation-asgi==0.45b0
opentelemetry-instrumentation-fastapi==0.45b0
opentelemetry-proto==1.24.0
opentelemetry-sdk==1.24.0
opentelemetry-semantic-conventions==0.45b0
opentelemetry-util-http==0.45b0
orjson==3.10.0
overrides==7.7.0
packaging==23.2
pandas==2.2.2
pandocfilters==1.5.1
parso==0.8.4
pexpect==4.9.0
pillow==10.3.0
platformdirs==4.2.0
posthog==3.5.0
prometheus_client==0.20.0
prompt-toolkit==3.0.43
proto-plus==1.23.0
protobuf==4.25.3
psutil==5.9.8
ptyprocess==0.7.0
pulsar-client==3.5.0
pure-eval==0.2.2
pyarrow==15.0.2
pyarrow-hotfix==0.6
pyasn1==0.6.0
pyasn1_modules==0.4.0
pycparser==2.22
pydantic==2.7.0
pydantic_core==2.18.1
pydub==0.25.1
Pygments==2.17.2
pyparsing==3.1.2
pypdf==4.0.0
PyPDF2==3.0.1
PyPika==0.48.9
pyproject_hooks==1.1.0
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
python-json-logger==2.0.7
python-multipart==0.0.9
pytz==2024.1
PyYAML==6.0.1
pyzmq==25.1.2
qtconsole==5.5.1
QtPy==2.4.1
rapidfuzz==3.8.1
referencing==0.34.0
requests==2.31.0
requests-oauthlib==2.0.0
responses==0.18.0
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
rich==13.7.1
rpds-py==0.18.0
rsa==4.9
ruff==0.3.7
semantic-version==2.10.0
Send2Trash==1.8.3
setuptools==68.2.2
sgmllib3k==1.0.0
shellingham==1.5.4
six==1.16.0
sniffio==1.3.1
soupsieve==2.5
SQLAlchemy==2.0.30
stack-data==0.6.3
starlette==0.37.2
sympy==1.12
tenacity==8.2.3
terminado==0.18.1
tinycss2==1.2.1
tokenizers==0.19.1
tomlkit==0.12.0
toolz==0.12.1
tornado==6.4
tqdm==4.66.2
traitlets==5.14.2
typer==0.12.3
types-python-dateutil==2.9.0.20240316
typing-inspect==0.9.0
typing_extensions==4.11.0
tzdata==2024.1
uri-template==1.3.0
uritemplate==4.1.1
urllib3==2.2.1
uvicorn==0.29.0
uvloop==0.19.0
watchfiles==0.21.0
wcwidth==0.2.13
webcolors==1.13
webencodings==0.5.1
websocket-client==1.7.0
websockets==11.0.3
wheel==0.41.2
widgetsnbextension==4.0.10
wrapt==1.16.0
xxhash==3.4.1
yarl==1.9.4
zipp==3.18.1
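
Note that the list pins a full development environment (Jupyter, LangChain, matplotlib) beyond what app.py itself imports. A plausible local setup, assuming a fresh virtual environment: run pip install -r requirements.txt, export GEMINI_API_KEY (and S2_API_KEY if available), then run python app.py to launch the Gradio app.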