'''
LLM scanner streamlit app

Run with: streamlit run .\app.py

Functionality:
- tokenize documents
- respond to queries
- generate new documents

Based on:
1. https://huggingface.co/spaces/llamaindex/llama_index_vector_demo
2. https://github.com/logan-markewich/llama_index_starter_pack/blob/main/streamlit_term_definition/

TODO:
- customize to other [LLMs](https://gpt-index.readthedocs.io/en/latest/reference/llm_predictor.html#llama_index.llm_predictor.LLMPredictor)
- add guardrails
- prevent answers on facts outside the documents (e.g. the birthdate of the Michael Jordan in the docs vs. that of the baseball player)
'''
import os

import streamlit as st
from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader, ServiceContext, LLMPredictor, PromptHelper, readers
from llama_index import StorageContext, load_index_from_storage
from langchain import OpenAI, HuggingFaceHub

import app_constants

index_fpath = "./llamas_index"
documents_folder = "./documents"  # initial documents - additional ones can be added via upload

if "dummy" not in st.session_state:
    st.session_state["dummy"] = "dummy"


#@st.cache_resource  # would make the index globally available to all users and sessions
def initialize_index(index_name, documents_folder, persisted_to_storage=True):
    """
    Creates a vector index over the documents in documents_folder.
    If a persisted index already exists at index_name, it is loaded instead.
    """
    max_input_size = 4096     # maximum LLM input size
    num_outputs = 2000        # number of output tokens
    max_chunk_overlap = 20    # maximum overlap between chunks
    chunk_size_limit = 600    # chunk size limit for the prompt helper
    llm_predictor = LLMPredictor(llm=OpenAI(openai_api_key=api_key,  # module-level global, set in the Setup tab
                                            temperature=0.5,
                                            model_name="text-davinci-003",
                                            max_tokens=num_outputs))
    #wishlist: alternative LLMs
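    # A possible sketch for the alternatives wishlist, mirroring the commented-out
    # HuggingFaceHub example in the Setup tab below (repo_id and model_kwargs here
    # are illustrative assumptions, not tested choices):
    #llm_predictor = LLMPredictor(llm=HuggingFaceHub(repo_id="google/flan-t5-xl",
    #                                                model_kwargs={"temperature": 0.5, "max_length": num_outputs}))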
    service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)
    if os.path.exists(index_name):
        storage_context = StorageContext.from_defaults(persist_dir=index_name)
        doc_index = load_index_from_storage(service_context=service_context, storage_context=storage_context)
    else:
        #st.info("Updating the document index")
        prompt_helper = PromptHelper(max_input_size, num_outputs, max_chunk_overlap, chunk_size_limit=chunk_size_limit)
        documents = SimpleDirectoryReader(documents_folder).load_data()
        doc_index = GPTVectorStoreIndex.from_documents(
            documents, llm_predictor=llm_predictor, prompt_helper=prompt_helper,
            chunk_size_limit=512, service_context=service_context
        )
        if persisted_to_storage:
            doc_index.storage_context.persist(index_name)
    #avoid this side-effect: st.session_state["doc_index"] = doc_index
    return doc_index


#returns the answer text, available to future callers
def query_index(_index, query_text):
    query_engine = _index.as_query_engine()
    response = query_engine.query(query_text)
    return str(response)
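
# Example usage (assuming an index built by initialize_index above;
# the question string is illustrative):
#   answer = query_index(st.session_state["doc_index"], "What topics do the documents cover?")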

#page layout is written directly here
st.title("LLM scanner")
st.markdown(
    (
        "This app allows you to query documents!\n\n"
        "Powered by [Llama Index](https://gpt-index.readthedocs.io/en/latest/index.html)"
    )
)

setup_tab, upload_tab, query_tab = st.tabs(
    ["Setup", "Index", "Query"]
)

with setup_tab:
    st.subheader("LLM Setup")
    api_key = st.text_input("Enter your OpenAI API key here", type="password")
    #wishlist llm_name = st.selectbox(
    #    "Which LLM?", ["text-davinci-003", "gpt-3.5-turbo", "gpt-4"]
    #)
    #repo_id = "google/flan-t5-xl"  # see https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads for other options
    #llm = HuggingFaceHub(repo_id=repo_id, model_kwargs={"temperature": 0, "max_length": 64})
    #model_temperature = st.slider(
    #    "LLM Temperature", min_value=0.0, max_value=1.0, step=0.1
    #)
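    # If the wishlist selector above is enabled, the chosen model could be passed
    # down to initialize_index, e.g. (hypothetical extra parameters, not implemented):
    #st.session_state["doc_index"] = initialize_index(index_fpath, documents_folder,
    #                                                 llm_name=llm_name, temperature=model_temperature)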
    if api_key and "doc_index" not in st.session_state:  # st.text_input returns "" until a key is entered
        st.session_state["doc_index"] = initialize_index(index_fpath, documents_folder, persisted_to_storage=False)

with upload_tab:
    st.subheader("Upload documents")
    if st.button("Re-initialize index with pre-packaged documents"):
        st.session_state["doc_index"] = initialize_index(index_fpath, documents_folder, persisted_to_storage=False)
        st.info('Documents in index: ' + str(len(st.session_state["doc_index"].docstore.docs)))
    if "doc_index" in st.session_state:
        doc_index = st.session_state["doc_index"]
        st.markdown(
            "Either upload a document, or enter the text manually."
        )
        uploaded_file = st.file_uploader(
            "Upload a document (pdf):", type=["pdf"]
        )
        document_text = st.text_area("Enter text")
        if st.button("Add document to index") and (uploaded_file or document_text):
            with st.spinner("Inserting (large files may be slow)..."):
                if document_text:
                    doc_index.refresh([readers.Document(text=document_text)])  # tokenizes the new document
                    st.session_state["doc_index"] = doc_index
                    st.info('Documents in index: ' + str(len(doc_index.docstore.docs)))
                if uploaded_file:
                    uploads_folder = "uploads/"
                    if not os.path.exists(uploads_folder):
                        os.mkdir(uploads_folder)
                    #file_details = {"FileName":uploaded_file.name,"FileType":uploaded_file.type}
                    with open(uploads_folder + "tmp.pdf", "wb") as f:
                        f.write(uploaded_file.getbuffer())
                    documents = SimpleDirectoryReader(uploads_folder).load_data()
                    doc_index.refresh(documents)  # tokenizes the new documents
                    st.session_state["doc_index"] = doc_index
                    st.info('Documents in index: ' + str(len(doc_index.docstore.docs)))
                    os.remove(uploads_folder + "tmp.pdf")

with query_tab:
    st.subheader("Query Tab")
    st.write("Enter a query about the included documents. Find [documentation here](https://huggingface.co/spaces/agutfraind/llmscanner)")
    doc_index = None
    #api_key = st.text_input("Enter your OpenAI API key here:", type="password")
    if api_key:
        os.environ['OPENAI_API_KEY'] = api_key
        #doc_index = initialize_index(index_fpath, documents_folder)
        if "doc_index" in st.session_state:
            doc_index = st.session_state["doc_index"]
            st.info('Documents in index: ' + str(len(doc_index.docstore.docs)))
        else:
            st.warning("Doc index is not available - initialize it or upload documents first")
            #st.warning("Please enter your api key first.")
    if doc_index and api_key:
        select_type_your_own = 'type your own...'
        options_for_queries = app_constants.canned_questions + [select_type_your_own]
        query_selection = st.selectbox("Select option", options=options_for_queries)
        query_text = None
        if query_selection == select_type_your_own:
            query_text = st.text_input("Query text")
        else:
            query_text = query_selection
        if st.button("Run Query") and doc_index and query_text:
            response = query_index(doc_index, query_text)
            st.markdown(response)

            llm_col, embed_col = st.columns(2)
            with llm_col:
                st.markdown(f"LLM Tokens Used: {doc_index.service_context.llm_predictor._last_token_usage}")
            with embed_col:
                st.markdown(f"Embedding Tokens Used: {doc_index.service_context.embed_model._last_token_usage}")