Spaces:

Dakhoo
/

InQuest

Paused

App Files Files Community

Dakhoo committed on Aug 28, 2023

Commit

f3f7425

1 Parent(s): 12e0acb

fixed uploader type

Browse files

Files changed (4) hide show

.flake8 +1 -0
.gitignore +163 -0
.pre-commit-config.yaml +59 -0
app.py +331 -2

.flake8 ADDED Viewed

	@@ -0,0 +1 @@


1	+ ignore = E501

.gitignore ADDED Viewed

	@@ -0,0 +1,163 @@

+tempdir/*
+hf_model/*
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/

.pre-commit-config.yaml ADDED Viewed

	@@ -0,0 +1,59 @@

+repos:
+  - repo: https://github.com/psf/black
+    rev: 23.3.0
+    hooks:
+      - id: black
+  - repo: https://github.com/pycqa/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+        args: ["--profile", "black"]
+  - repo: https://github.com/pycqa/flake8
+    rev: 6.0.0
+    hooks:
+      - id: flake8
+        exclude: .*/tests|^sandbox
+        additional_dependencies: [flake8-docstrings]
+        args:
+          [
+            "--max-line-length=88",
+            "--extend-ignore=E203,W503",
+            "--docstring-convention",
+            "google",
+          ]
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.4.0
+    hooks:
+      - id: requirements-txt-fixer
+        files: .*/requirements.*\.txt$
+      - id: check-json
+        exclude: '^data/.*'
+      - id: check-yaml
+        exclude: '^applications/.*/charts/.*\.yaml$'
+      - id: check-added-large-files
+      - id: check-merge-conflict
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.3.0
+    hooks:
+      - id: mypy
+        args: [--ignore-missing-imports, --disallow-untyped-defs, --install-types, --non-interactive]
+        exclude: .*/tests|^sandbox
+  - repo: local
+    hooks:
+    - id: hadolint
+      name: hadolint
+      entry: hadolint/hadolint:v2.12.1-beta hadolint --ignore DL3008 --no-color
+      language: docker_image
+      types: [file, dockerfile]
+  - repo: https://github.com/sqlfluff/sqlfluff
+    rev: 2.1.1
+    hooks:
+      - id: sqlfluff-lint
+      - id: sqlfluff-fix

app.py CHANGED Viewed

@@ -1,4 +1,333 @@
 import streamlit as st
-x = st.slider('Select a value')
-st.write(x, 'squared is', x * x)

+"""This is a public module. It should have a docstring."""
+import itertools
+import os
+import random
+from typing import Any, List, Tuple
+import openai
 import streamlit as st
+from langchain.agents import AgentExecutor, OpenAIFunctionsAgent
+from langchain.agents.agent_toolkits import create_retriever_tool
+from langchain.agents.openai_functions_agent.agent_token_buffer_memory import (
+    AgentTokenBufferMemory,
+)
+from langchain.callbacks import StreamlitCallbackHandler
+from langchain.chains import QAGenerationChain
+from langchain.chat_models import ChatOpenAI
+from langchain.document_loaders import PyPDFLoader
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.prompts import MessagesPlaceholder
+from langchain.schema import AIMessage, HumanMessage, SystemMessage
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores import FAISS
+st.set_page_config(page_title="PDF QA", page_icon="📚")
+starter_message = "Ask me anything about the Doc!"
+@st.cache_resource
+def create_prompt(openai_api_key: str) -> Tuple[SystemMessage, ChatOpenAI]:
+    """Create prompt."""
+    try:
+        # Make your OpenAI API request here
+        llm = ChatOpenAI(
+            temperature=0,
+            model_name="gpt-3.5-turbo",
+            streaming=True,
+            openai_api_key=openai_api_key,
+        )
+    except openai.error.AuthenticationError as e:
+        # Handle timeout error, e.g. retry or log
+        print(f"Please check your API key and try again. : {e}")
+        pass
+    message = SystemMessage(
+        content=(
+            "You are a helpful chatbot who is tasked with answering questions about context given through uploaded documents."  # noqa: E501 comment
+            "Unless otherwise explicitly stated, it is probably fair to assume that questions are about the context given."  # noqa: E501 comment
+            "If there is any ambiguity, you probably assume they are about that."  # noqa: E501 comment
+        )
+    )
+    prompt = OpenAIFunctionsAgent.create_prompt(
+        system_message=message,
+        extra_prompt_messages=[MessagesPlaceholder(variable_name="history")],
+    )
+    return prompt, llm
+@st.cache_data
+def save_file_locally(file: Any) -> str:
+    """Save uploaded files locally."""
+    doc_path = os.path.join("tempdir", file.name)
+    with open(doc_path, "wb") as f:
+        f.write(file.getbuffer())
+    return doc_path
+@st.cache_data
+def load_docs(files: List[Any], url: bool = False) -> str:
+    """Load and process the uploaded PDF files."""
+    if not url:
+        st.info("`Reading doc ...`")
+        documents = []
+        for file in files:
+            doc_path = save_file_locally(file)
+            pages = PyPDFLoader(doc_path)
+            documents.extend(pages.load())
+    return ",".join([doc.page_content for doc in documents])
+@st.cache_data
+def gen_embeddings() -> HuggingFaceEmbeddings:
+    """Generate embeddings for given model."""
+    embeddings = HuggingFaceEmbeddings(
+        cache_folder="hf_model"
+    )  # https://github.com/UKPLab/sentence-transformers/issues/1828
+    return embeddings
+@st.cache_resource
+def process_corpus(corpus: str, chunk_size: int = 1000, overlap: int = 50) -> List:
+    """Process text for Semantic Search."""
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size, chunk_overlap=overlap
+    )
+    texts = text_splitter.split_text(corpus)
+    # Display the number of text chunks
+    num_chunks = len(texts)
+    st.write(f"Number of text chunks: {num_chunks}")
+    # select embedding model
+    embeddings = gen_embeddings()
+    # create vectorstore
+    vectorstore = FAISS.from_texts(texts, embeddings).as_retriever(
+        search_kwargs={"k": 4}
+    )
+    # create retriever tool
+    tool = create_retriever_tool(
+        vectorstore,
+        "search_docs",
+        "Searches and returns documents using the context provided as a source, relevant to the user input question.",  # noqa: E501 comment
+    )
+    tools = [tool]
+    return tools
+@st.cache_data
+def generate_agent_executer(text: str) -> List[AgentExecutor]:
+    """Generate the memory functionality."""
+    tools = process_corpus(text)
+    agent = OpenAIFunctionsAgent(llm=llm, tools=tools, prompt=prompt)
+    # Synthwave
+    agent_executor = AgentExecutor(
+        agent=agent,
+        tools=tools,
+        verbose=True,
+        return_intermediate_steps=True,
+    )
+    return agent_executor
+@st.cache_data
+def generate_eval(raw_text: str, N: int, chunk: int) -> List:
+    """Generate the focusing functionality."""
+    # Generate N questions from context of chunk chars
+    # IN: text, N questions, chunk size to draw question from in the doc
+    # OUT: eval set as JSON list
+    # raw_text = ','.join(raw_text)
+    update = st.empty()
+    ques_update = st.empty()
+    update.info("`Generating sample questions ...`")
+    n = len(raw_text)
+    starting_indices = [random.randint(0, n - chunk) for _ in range(N)]
+    sub_sequences = [raw_text[i : i + chunk] for i in starting_indices]
+    chain = QAGenerationChain.from_llm(llm)
+    eval_set = []
+    for i, b in enumerate(sub_sequences):
+        try:
+            qa = chain.run(b)
+            eval_set.append(qa)
+            ques_update.info(f"Creating Question: {i+1}")
+        except ValueError:
+            st.warning(f"Error in generating Question: {i+1}...", icon="⚠️")
+            continue
+    eval_set_full = list(itertools.chain.from_iterable(eval_set))
+    update.empty()
+    ques_update.empty()
+    return eval_set_full
+@st.cache_resource()
+def gen_side_bar_qa(text: str) -> None:
+    """Generate responses from query."""
+    if text:
+        # Check if there are no generated question-answer pairs in the session state
+        if "eval_set" not in st.session_state:
+            # Use the generate_eval function to generate question-answer pairs
+            num_eval_questions = 5  # Number of question-answer pairs to generate
+            st.session_state.eval_set = generate_eval(text, num_eval_questions, 3000)
+        # Display the question-answer pairs in the sidebar with smaller text
+        for i, qa_pair in enumerate(st.session_state.eval_set):
+            st.sidebar.markdown(
+                f"""
+                <div class="css-card">
+                <span class="card-tag">Question {i + 1}</span>
+                    <p style="font-size: 12px;">{qa_pair['question']}</p>
+                    <p style="font-size: 12px;">{qa_pair['answer']}</p>
+                </div>
+                """,
+                unsafe_allow_html=True,
+            )
+        st.write("Ready to answer your questions.")
+# Add custom CSS
+st.markdown(
+    """
+    <style>
+    #MainMenu {visibility: hidden;
+    # }
+        footer {visibility: hidden;
+        }
+        .css-card {
+            border-radius: 0px;
+            padding: 30px 10px 10px 10px;
+            background-color: black;
+            box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+            margin-bottom: 10px;
+            font-family: "IBM Plex Sans", sans-serif;
+        }
+        .card-tag {
+            border-radius: 0px;
+            padding: 1px 5px 1px 5px;
+            margin-bottom: 10px;
+            position: absolute;
+            left: 0px;
+            top: 0px;
+            font-size: 0.6rem;
+            font-family: "IBM Plex Sans", sans-serif;
+            color: white;
+            background-color: green;
+            }
+        .css-zt5igj {left:0;
+        }
+        span.css-10trblm {margin-left:0;
+        }
+        div.css-1kyxreq {margin-top: -40px;
+        }
+    </style>
+    """,
+    unsafe_allow_html=True,
+)
+st.write(
+    """
+<div style="display: flex; align-items: center; margin-left: 0;">
+    <h1 style="display: inline-block;">PDF GPT</h1>
+    <sup style="margin-left:5px;font-size:small; color: green;">beta</sup>
+</div>
+""",
+    unsafe_allow_html=True,
+)
+# Build sidebar
+with st.sidebar:
+    openai_api_key = st.text_input(
+        "OpenAI API Key", key="api_key_openai", type="password"
+    )
+    if openai_api_key and openai_api_key.startswith("sk-"):
+        prompt, llm = create_prompt(openai_api_key)
+        memory = AgentTokenBufferMemory(llm=llm)
+        "[here OpenAI API key](https://platform.openai.com/account/api-keys)"
+    else:
+        st.info("Please add your correct OpenAI API key in the sidebar.")
+# If there's no OpenAI API key, show a message and stop the app for rendering further
+if not openai_api_key:
+    st.info("Please add your OpenAI API key in the sidebar.")
+    st.stop()
+# Use RecursiveCharacterTextSplitter as the default and only text splitter
+splitter_type = "RecursiveCharacterTextSplitter"
+uploaded_files = st.file_uploader(
+    "Upload a PDF Document", type=["pdf"], accept_multiple_files=True
+)
+if uploaded_files:
+    # Check if last_uploaded_files is not in session_state or
+    # if uploaded_files are different from last_uploaded_files
+    if (
+        "last_uploaded_files" not in st.session_state
+        or st.session_state.last_uploaded_files != uploaded_files
+    ):
+        st.session_state.last_uploaded_files = uploaded_files
+        if "eval_set" in st.session_state:
+            del st.session_state["eval_set"]
+    # Load and process the uploaded PDF or TXT files.
+    raw_pdf_text = load_docs(uploaded_files)
+    st.success("Documents uploaded and processed.")
+    # # Question and answering
+    # user_question = st.text_input("Enter your question:")
+    # embeddings = gen_embeddings()
+    # gen_side_bar_qa(raw_pdf_text)
+    # memory, agent_executor = generate_memory_agent_executre(raw_pdf_text)
+    agent_executor = generate_agent_executer(raw_pdf_text)
+if "messages" not in st.session_state or st.sidebar.button("Clear message history"):
+    st.session_state["messages"] = [AIMessage(content=starter_message)]
+for msg in st.session_state.messages:
+    if isinstance(msg, AIMessage):
+        st.chat_message("assistant").write(msg.content)
+    elif isinstance(msg, HumanMessage):
+        st.chat_message("user").write(msg.content)
+    memory.chat_memory.add_message(msg)
+if user_question := st.chat_input(placeholder=starter_message):
+    st.chat_message("user").write(user_question)
+    with st.chat_message("assistant"):
+        st_callback = StreamlitCallbackHandler(
+            st.container(),
+            expand_new_thoughts=True,
+            collapse_completed_thoughts=True,
+            thought_labeler=None,
+        )
+        response = agent_executor(
+            {"input": user_question, "history": st.session_state.messages},
+            callbacks=[st_callback],
+            include_run_info=True,
+        )
+        st.session_state.messages.append(AIMessage(content=response["output"]))
+        st.write(response["output"])
+        memory.save_context({"input": user_question}, response)
+        st.session_state["messages"] = memory.buffer
+        run_id = response["__run"].run_id
+        col_blank, col_text, col1, col2 = st.columns([10, 2, 1, 1])