Spaces:

Annelisseishere
/

Streamlit_GPT

Sleeping

App Files Files Community

Streamlit_GPT / app.py

Annelisseishere

first commit

9ac353e almost 3 years ago

raw

history blame contribute delete

4.76 kB

	from dotenv import load_dotenv
	import os
	import streamlit as st
	from PyPDF2 import PdfFileReader
	from langchain.text_splitter import CharacterTextSplitter
	from langchain.embeddings.openai import OpenAIEmbeddings
	from langchain.vectorstores import FAISS
	from langchain.chains.question_answering import load_qa_chain
	from langchain.llms import OpenAI as LLMSOpenAI
	from langchain.llms import AzureOpenAI
	from langchain.callbacks import get_openai_callback
	from langchain.chat_models import ChatOpenAI
	from docx import Document
	from openpyxl import load_workbook
	import pdfplumber


	def extract_text_from_pdf(pdf_file):
	with pdfplumber.open(pdf_file) as pdf:
	text = ""
	for page in pdf.pages:
	text += page.extract_text()
	return text


	def extract_text_from_docx(docx_file):
	doc = Document(docx_file)
	paragraphs = [paragraph.text for paragraph in doc.paragraphs]
	return "\n".join(paragraphs)


	def extract_text_from_excel(excel_file):
	workbook = load_workbook(excel_file)
	text = ""
	for sheet in workbook.sheetnames:
	worksheet = workbook[sheet]
	for row in worksheet.iter_rows():
	for cell in row:
	if cell.value:
	text += str(cell.value) + "\n"
	return text


	def split_text_into_chunks(text):
	text_splitter = CharacterTextSplitter(
	separator="\n",
	chunk_size=1000,
	chunk_overlap=200,
	length_function=len
	)
	return text_splitter.split_text(text)


	def create_knowledge_base(chunks, api_key=None):
	embeddings = OpenAIEmbeddings(openai_api_key=api_key)
	knowledge_base = FAISS.from_texts(chunks, embeddings)
	return knowledge_base


	def answer_question(question, knowledge_base, model):
	docs = knowledge_base.similarity_search(question)
	llm = model(model_name="gpt-3.5-turbo", openai_api_key=st.session_state.api_key)
	chain = load_qa_chain(llm, chain_type="stuff")
	with get_openai_callback() as cb:
	response = chain.run(input_documents=docs, question=question)
	return response


	def save_api_key(api_key):
	st.session_state.api_key = api_key


	def main():
	load_dotenv()
	st.set_page_config(page_title="Ask Your PDF", layout="wide")

	# Sidebar
	st.sidebar.title("Settings")

	# API Key input
	st.sidebar.subheader("API Key")
	api_key = st.sidebar.text_input("Insert your API Key", type="password")
	st.sidebar.button("Save API Key", on_click=save_api_key, args=(api_key,))

	model_type = st.sidebar.selectbox("Select Language Model", ["OpenAI", "AzureOpenAI"])
	if model_type == "AzureOpenAI":
	model = AzureOpenAI
	else:
	model = ChatOpenAI

	chunk_size = st.sidebar.slider("Chunk Size", min_value=500, max_value=2000, value=1000, step=100)
	chunk_overlap = st.sidebar.slider("Chunk Overlap", min_value=100, max_value=500, value=200, step=50)
	show_content = st.sidebar.checkbox("Show Document Content")
	show_answers = st.sidebar.checkbox("Show Previous Answers")

	# Main content
	st.title("Ask Your Document 💭")
	file_format = st.selectbox("Select File Format", ["PDF", "docx", "xlsx"])
	document = st.file_uploader("Upload Document", type=[file_format.lower()])

	if not hasattr(st.session_state, "api_key") or not st.session_state.api_key:
	st.warning("You need to insert your API Key first.")
	elif document is not None:
	if file_format == "PDF":
	text = extract_text_from_pdf(document)
	elif file_format == "docx":
	text = extract_text_from_docx(document)
	elif file_format == "xlsx":
	text = extract_text_from_excel(document)
	else:
	text = ""

	if show_content:
	st.subheader("Document Text:")
	st.text_area("Content", value=text, height=300)

	chunks = split_text_into_chunks(text)
	knowledge_base = create_knowledge_base(chunks, api_key=st.session_state.api_key)

	user_question = st.text_input("Ask a question based on the document content:")

	if user_question:
	response = answer_question(user_question, knowledge_base, model)
	st.subheader("Answer:")
	st.write(response)

	# Store and display previous answers
	if "answers" not in st.session_state:
	st.session_state.answers = []
	st.session_state.answers.append((user_question, response))

	if show_answers:
	st.subheader("Previous Answers:")
	for question, answer in st.session_state.answers:
	st.write(f"Question: {question}")
	st.write(f"Answer: {answer}")
	st.write("------")


	if __name__ == '__main__':
	main()