Spaces:

agutfraind
/

llmscanner

Build error

App Files Files Community

agutfraind committed on May 22, 2023

Commit

7baa084

1 Parent(s): 863df0d

uploading docs

Browse files

Files changed (3) hide show

.streamlit/config.toml +6 -0
app.py +87 -30
app_constants.py +6 -1

.streamlit/config.toml ADDED Viewed

	@@ -0,0 +1,6 @@

+[theme]
+primaryColor="#F63366"
+backgroundColor="#FFFFFF"
+secondaryBackgroundColor="#F0F2F6"
+textColor="#262730"
+font="sans serif"

app.py CHANGED Viewed

@@ -12,31 +12,29 @@ Based on:
 1. https://huggingface.co/spaces/llamaindex/llama_index_vector_demo
 2. https://github.com/logan-markewich/llama_index_starter_pack/blob/main/streamlit_term_definition/
 TODO:
-- document upload
 - customize to other [LLMs](https://gpt-index.readthedocs.io/en/latest/reference/llm_predictor.html#llama_index.llm_predictor.LLMPredictor)
-- canned questions
 '''
 import os
 import streamlit as st
-from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader, ServiceContext, LLMPredictor, PromptHelper
 from llama_index import StorageContext, load_index_from_storage
 from langchain import OpenAI, HuggingFaceHub
 import app_constants
-index_fpath = "./index.json"
-documents_folder = "./documents"
 if "dummy" not in st.session_state:
     st.session_state["dummy"] = "dummy"
-@st.cache_resource  #st makes this globally available for all users and sessions
-def initialize_index(index_name, documents_folder):
     """
     creates an index of the documents in the folder
     if the index exists, skipped
@@ -50,8 +48,10 @@ def initialize_index(index_name, documents_folder):
     # set chunk size limit
     chunk_size_limit = 600
-    llm_predictor = LLMPredictor(llm=OpenAI(temperature=0.5, model_name="text-davinci-003", max_tokens=num_outputs))
     #wishlist: alternatives
     service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)
     if os.path.exists(index_name):
@@ -66,8 +66,10 @@ def initialize_index(index_name, documents_folder):
             documents, llm_predictor=llm_predictor, prompt_helper=prompt_helper,
             chunk_size_limit=512, service_context=service_context
         )
-        doc_index.storage_context.persist(index_fpath)
     return doc_index
 #st returns data that's available for future caller
@@ -84,17 +86,18 @@ st.title("LLM scanner")
 st.markdown(
     (
         "This app allows you to query documents!\n\n"
-        "Powered by [Llama Index](https://gpt-index.readthedocs.io/en/latest/index.html) and supporting multiple LLMs"
     )
 )
-setup_tab, query_tab = st.tabs(
-    ["Setup", "Query"]
 )
 with setup_tab:
     st.subheader("LLM Setup")
     api_key = st.text_input("Enter your OpenAI API key here", type="password")
     #wishlist llm_name = st.selectbox(
     #    "Which LLM?", ["text-davinci-003", "gpt-3.5-turbo", "gpt-4"]
     #)
@@ -105,6 +108,47 @@ with setup_tab:
     #    "LLM Temperature", min_value=0.0, max_value=1.0, step=0.1
     #)
 with query_tab:
     st.subheader("Query Tab")
@@ -114,22 +158,35 @@ with query_tab:
     #api_key = st.text_input("Enter your OpenAI API key here:", type="password")
     if api_key:
         os.environ['OPENAI_API_KEY'] = api_key
-        doc_index = initialize_index(index_fpath, documents_folder)
     if doc_index is None:
-        st.warning("Please enter your api key first.")
-    text = st.text_input("Query text:", value="What did the author do growing up?")
-    if st.button("Run Query") and text is not None:
-        response = query_index(doc_index, text)
-        st.markdown(response)
-        llm_col, embed_col = st.columns(2)
-        with llm_col:
-            st.markdown(f"LLM Tokens Used: {doc_index.service_context.llm_predictor._last_token_usage}")
-        with embed_col:
-            st.markdown(f"Embedding Tokens Used: {doc_index.service_context.embed_model._last_token_usage}")

 1. https://huggingface.co/spaces/llamaindex/llama_index_vector_demo
 2. https://github.com/logan-markewich/llama_index_starter_pack/blob/main/streamlit_term_definition/
 TODO:
 - customize to other [LLMs](https://gpt-index.readthedocs.io/en/latest/reference/llm_predictor.html#llama_index.llm_predictor.LLMPredictor)
+- guardrails on
+- prevent answers on facts outside the document (e.g. birthdate of Michael Jordan in the docs vs. the baseball player)
 '''
 import os
 import streamlit as st
+from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader, ServiceContext, LLMPredictor, PromptHelper, readers
 from llama_index import StorageContext, load_index_from_storage
 from langchain import OpenAI, HuggingFaceHub
 import app_constants
+index_fpath = "./llamas_index"
+documents_folder = "./documents" #initial documents - additional can be added via upload
 if "dummy" not in st.session_state:
     st.session_state["dummy"] = "dummy"
+#@st.cache_resource  #st makes this globally available for all users and sessions
+def initialize_index(index_name, documents_folder, persisted_to_storage=True):
     """
     creates an index of the documents in the folder
     if the index exists, skipped
     # set chunk size limit
     chunk_size_limit = 600
+    llm_predictor = LLMPredictor(llm=OpenAI(openai_api_key=api_key, #from env
+                                            temperature=0.5,
+                                            model_name="text-davinci-003",
+                                            max_tokens=num_outputs))
     #wishlist: alternatives
     service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)
     if os.path.exists(index_name):
             documents, llm_predictor=llm_predictor, prompt_helper=prompt_helper,
             chunk_size_limit=512, service_context=service_context
         )
+        if persisted_to_storage:
+            doc_index.storage_context.persist(index_fpath)
+    #avoid this side-effect: st.session_state["doc_index"] = "doc_index"
     return doc_index
 #st returns data that's available for future caller
 st.markdown(
     (
         "This app allows you to query documents!\n\n"
+        "Powered by [Llama Index](https://gpt-index.readthedocs.io/en/latest/index.html)"
     )
 )
+setup_tab, upload_tab, query_tab = st.tabs(
+    ["Setup", "Index", "Query"]
 )
 with setup_tab:
     st.subheader("LLM Setup")
     api_key = st.text_input("Enter your OpenAI API key here", type="password")
     #wishlist llm_name = st.selectbox(
     #    "Which LLM?", ["text-davinci-003", "gpt-3.5-turbo", "gpt-4"]
     #)
     #    "LLM Temperature", min_value=0.0, max_value=1.0, step=0.1
     #)
+if api_key is not None and "doc_index" not in st.session_state:
+    st.session_state["doc_index"] = initialize_index(index_fpath, documents_folder, persisted_to_storage=False)
+with upload_tab:
+    st.subheader("Upload documents")
+    if st.button("Re-initialize index with pre-packaged documents"):
+        st.session_state["doc_index"] = initialize_index(index_fpath, documents_folder, persisted_to_storage=False)
+        st.info('Documents in index: ' + str(st.session_state["doc_index"].docstore.docs.__len__()))
+    if "doc_index" in st.session_state:
+        doc_index = st.session_state["doc_index"]
+        st.markdown(
+            "Either upload a document, or enter the text manually."
+        )
+        uploaded_file = st.file_uploader(
+            "Upload a document (pdf):", type=["pdf"]
+        )
+        document_text = st.text_area("Enter text")
+        if st.button("Add document to index") and (uploaded_file or document_text):
+            with st.spinner("Inserting (large files may be slow)..."):
+                if document_text:
+                    doc_index.refresh([readers.Document(text=document_text)]) #tokenizes new documents
+                    st.info('Documents in index: ' + str(st.session_state["doc_index"].docstore.docs.__len__()))
+                    st.session_state["doc_index"] = doc_index
+                if uploaded_file:
+                    uploads_folder = "uploads/"
+                    if not os.path.exists(uploads_folder):
+                        os.mkdir(uploads_folder)
+                    #file_details = {"FileName":uploaded_file.name,"FileType":uploaded_file.type}
+                    with open(uploads_folder + "tmp.pdf", "wb") as f:
+                        f.write(uploaded_file.getbuffer())
+                    documents = SimpleDirectoryReader(uploads_folder).load_data()
+                    doc_index.refresh(documents) #tokenizes new documents
+                    st.session_state["doc_index"] = doc_index
+                    st.info('Documents in index: ' + str(st.session_state["doc_index"].docstore.docs.__len__()))
+                    st.session_state["doc_index"] = doc_index
+                    os.remove(uploads_folder + "tmp.pdf")
 with query_tab:
     st.subheader("Query Tab")
     #api_key = st.text_input("Enter your OpenAI API key here:", type="password")
     if api_key:
         os.environ['OPENAI_API_KEY'] = api_key
+        #doc_index = initialize_index(index_fpath, documents_folder)
     if doc_index is None:
+        if "doc_index" in st.session_state:
+            doc_index = st.session_state["doc_index"]
+            st.info('Documents in index: ' + str(doc_index.docstore.docs.__len__()))
+        else:
+            st.warning("Doc index is not available - initialize or upload")
+        #st.warning("Please enter your api key first.")
+    if doc_index and api_key:
+        select_type_your_own = 'type your own...'
+        options_for_queries = app_constants.canned_questions + [select_type_your_own]
+        query_selection = st.selectbox("Select option", options=options_for_queries)
+        query_text = None
+        if query_selection == select_type_your_own:
+            query_text = st.text_input("Query text")
+        else:
+            query_text = query_selection
+        if st.button("Run Query") and (doc_index is not None) and (query_text is not None):
+            response = query_index(doc_index, query_text)
+            st.markdown(response)
+            llm_col, embed_col = st.columns(2)
+            with llm_col:
+                st.markdown(f"LLM Tokens Used: {doc_index.service_context.llm_predictor._last_token_usage}")
+            with embed_col:
+                st.markdown(f"Embedding Tokens Used: {doc_index.service_context.embed_model._last_token_usage}")

app_constants.py CHANGED Viewed

@@ -3,4 +3,9 @@ file for
 - canned prompts
 - constants (other than secrets)
-'''

 - canned prompts
 - constants (other than secrets)
+'''
+# Preset questions; app.py lists these in the query selector ahead of the free-text option.
+canned_questions = [
+    "When was Paul Graham born?",
+    "What was his first startup?"
+]