Spaces:
Runtime error
Runtime error
Commit
·
5ddf6ee
1
Parent(s):
598ae31
Add application file
Browse files- main.py +45 -0
- process.py +54 -0
- requirements.txt +10 -0
main.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pathlib
|
| 2 |
+
import base64
|
| 3 |
+
import streamlit as st
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
import process
|
| 6 |
+
|
| 7 |
+
st.title("PDF Question Answer")

# Folder that receives uploaded PDFs. It is created at start-up (including
# any missing parents) so later writes never hit a missing directory.
SAVE_FOLDER = './assets'
Path(SAVE_FOLDER).mkdir(exist_ok=True, parents=True)
|
| 12 |
+
|
| 13 |
+
def display_pdf(file):
    """Render the PDF stored at *file* inline on the Streamlit page.

    The file's bytes are base64-encoded and embedded as a ``data:`` URI in
    a 700x700 ``<iframe>``, which is emitted through ``st.markdown`` with
    raw HTML enabled.

    Args:
        file: Path (string or path-like) of the PDF to show.
    """
    raw_bytes = pathlib.Path(file).read_bytes()
    encoded = base64.b64encode(raw_bytes).decode('utf-8')
    iframe_html = (
        f'<iframe src="data:application/pdf;base64,{encoded}" '
        f'width="700" height="700" type="application/pdf"></iframe>'
    )
    # Raw HTML must be explicitly allowed, otherwise the tag is escaped.
    st.markdown(iframe_html, unsafe_allow_html=True)
|
| 18 |
+
|
| 19 |
+
# Input file upload
|
| 20 |
+
# --- Upload -----------------------------------------------------------------
pdf_uploaded = st.file_uploader(label="Upload PDF File", type=["pdf"])

# Stays None until a PDF has been written to disk; the submit handler uses
# this to tell whether there is a document to query.
save_path = None
if pdf_uploaded:
    # The upload name is client-supplied: keep only its final component so
    # it cannot point outside SAVE_FOLDER.
    safe_name = Path(pdf_uploaded.name).name
    save_path = Path(SAVE_FOLDER, safe_name)
    save_path.write_bytes(pdf_uploaded.getvalue())

    if save_path.exists():
        st.success(f'File {pdf_uploaded.name} is successfully saved!')
        display_pdf(save_path)

# --- Question form ----------------------------------------------------------
# NOTE(review): the original indentation was not recoverable, so the form is
# rendered unconditionally and the submit handler checks for a saved PDF.
st.markdown("**Please fill the below form :**")
with st.form(key="Form :", clear_on_submit=False):
    question_input = st.text_input('Question : ')
    # Mask the API key so it is not shown in clear text on screen.
    token_openai_input = st.text_input('Token OpenAI : ', type="password")
    submit_btn = st.form_submit_button(label='Submit')

if submit_btn:
    if save_path is None:
        st.warning("Please upload a PDF file before submitting a question.")
    elif not question_input.strip() or not token_openai_input.strip():
        st.warning("Please provide both a question and an OpenAI token.")
    else:
        # Reuse the path the file was actually written to instead of
        # rebuilding it from the raw upload name.
        answer = process.main_process(
            pdf_path=str(save_path),
            question=question_input,
            openai_key=token_openai_input,
        )
        st.markdown("Answer : ")
        st.markdown(answer)
|
process.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
# Import langchain lib
|
| 4 |
+
from langchain.llms import OpenAI
|
| 5 |
+
from langchain.chains import RetrievalQA
|
| 6 |
+
from langchain.vectorstores import Chroma
|
| 7 |
+
from langchain.embeddings.openai import OpenAIEmbeddings
|
| 8 |
+
from langchain.text_splitter import CharacterTextSplitter
|
| 9 |
+
from langchain.document_loaders import UnstructuredPDFLoader
|
| 10 |
+
from langchain.chains.question_answering import load_qa_chain
|
| 11 |
+
|
| 12 |
+
def load_pdf(pdf_path):
    """Load the PDF at *pdf_path* with ``UnstructuredPDFLoader``.

    Args:
        pdf_path: Location of the PDF file to read.

    Returns:
        Whatever ``UnstructuredPDFLoader.load()`` produces for the file.
    """
    return UnstructuredPDFLoader(pdf_path).load()
|
| 16 |
+
|
| 17 |
+
def update_openai_key(openai_key):
    """Store *openai_key* in the ``OPENAI_API_KEY`` environment variable.

    The value is set process-wide and overwrites any previous key.

    Args:
        openai_key: The OpenAI API key string to export.
    """
    os.environ.update(OPENAI_API_KEY=openai_key)
|
| 19 |
+
|
| 20 |
+
def texts_splitter(pages):
    """Split loaded documents into chunks for embedding.

    Uses a ``CharacterTextSplitter`` configured with ``chunk_size=3000``
    and ``chunk_overlap=20``.

    Args:
        pages: Documents to split, as returned by ``load_pdf``.

    Returns:
        The result of ``split_documents`` applied to *pages*.
    """
    splitter = CharacterTextSplitter(chunk_overlap=20, chunk_size=3000)
    return splitter.split_documents(pages)
|
| 24 |
+
|
| 25 |
+
def qa_langchain(docsearch):
    """Build a ``RetrievalQA`` chain on top of the vector store *docsearch*.

    The documents returned by the store's retriever are combined by a
    "stuff" question-answering chain driven by ``OpenAI(temperature=0)``.

    Args:
        docsearch: Vector store exposing ``as_retriever()``.

    Returns:
        The configured ``RetrievalQA`` instance.
    """
    llm = OpenAI(temperature=0)
    combine_chain = load_qa_chain(llm, chain_type="stuff")
    return RetrievalQA(
        combine_documents_chain=combine_chain,
        retriever=docsearch.as_retriever(),
    )
|
| 29 |
+
|
| 30 |
+
def main_process(pdf_path, question, openai_key):
    """Answer *question* using the contents of the PDF at *pdf_path*.

    Steps, in order: export the OpenAI key, load the PDF, split it into
    chunks, index the chunks in a Chroma store using OpenAI embeddings,
    build the retrieval QA chain and run the question through it.

    Args:
        pdf_path: Location of the PDF to query.
        question: The question to ask about the document.
        openai_key: OpenAI API key used for embeddings and the LLM.

    Returns:
        The value returned by the QA chain's ``run`` call.
    """
    # The key must be exported before any OpenAI-backed object is created.
    update_openai_key(openai_key)

    chunks = texts_splitter(load_pdf(pdf_path))
    vector_store = Chroma.from_documents(chunks, OpenAIEmbeddings())

    return qa_langchain(vector_store).run(question)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
if __name__ == "__main__":
    # Manual smoke test against a sample PDF.
    # The API key is read from the environment: secrets must never be
    # hard-coded in source. Any key previously committed here should be
    # treated as compromised and revoked.
    openai_key = os.environ.get('OPENAI_API_KEY')
    if not openai_key:
        raise SystemExit(
            'Set the OPENAI_API_KEY environment variable before running this script.'
        )

    answer = main_process(
        pdf_path='assets/599-Article Text-5382-1-10-20230603.pdf',
        question='Keberadaan mahkamah partai untuk',
        openai_key=openai_key,
    )
    print(answer)
|
requirements.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
openai
|
| 3 |
+
pypdf
|
| 4 |
+
langchain
|
| 5 |
+
unstructured
|
| 6 |
+
chromadb
|
| 7 |
+
tiktoken
|
| 8 |
+
pdf2image
|
| 9 |
+
urllib3==1.26.6
|
| 10 |
+
tabulate
|