Alimustoofaa committed on
Commit
5ddf6ee
·
1 Parent(s): 598ae31

Add application file

Browse files
Files changed (3) hide show
  1. main.py +45 -0
  2. process.py +54 -0
  3. requirements.txt +10 -0
main.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pathlib
2
+ import base64
3
+ import streamlit as st
4
+ from pathlib import Path
5
+ import process
6
+
7
st.title("PDF Question Answer")

# Directory that receives uploaded PDFs; created up front (including any
# missing parents) so later writes into it do not fail.
SAVE_FOLDER = './assets'
Path(SAVE_FOLDER).mkdir(parents=True, exist_ok=True)
12
+
13
def display_pdf(file):
    """Render a PDF inline in the Streamlit page.

    The file is read as bytes, base64-encoded and embedded as a ``data:`` URI
    inside a 700x700 ``<iframe>`` that is emitted through ``st.markdown``.

    Args:
        file: Path of the PDF to show (anything accepted by ``open``).
    """
    with open(file, "rb") as pdf_file:
        raw_bytes = pdf_file.read()

    base64_pdf = base64.b64encode(raw_bytes).decode('utf-8')
    iframe_html = (
        f'<iframe src="data:application/pdf;base64,{base64_pdf}" '
        f'width="700" height="700" type="application/pdf"></iframe>'
    )
    # Raw HTML is required for the iframe, hence unsafe_allow_html.
    st.markdown(iframe_html, unsafe_allow_html=True)
18
+
19
# Input file upload
pdf_uploaded = st.file_uploader(label="Upload PDF File", type=["pdf"])

# Stays None until a PDF has been uploaded and written to disk, so the submit
# handler below can tell "no document yet" apart from a usable path.
save_path = None
if pdf_uploaded:
    # The file name comes from the client: keep only its final component so a
    # crafted name such as "../../x.pdf" cannot escape SAVE_FOLDER.
    safe_name = Path(pdf_uploaded.name).name
    save_path = Path(SAVE_FOLDER, safe_name)
    save_path.write_bytes(pdf_uploaded.getvalue())

    if save_path.exists():
        st.success(f'File {pdf_uploaded.name} is successfully saved!')
        display_pdf(save_path)

# Display input Question
st.markdown("**Please fill the below form :**")
with st.form(key="Form :", clear_on_submit=False):
    question_input = st.text_input('Question : ')
    # Mask the API token so it is not shown in clear text on screen.
    token_openai_input = st.text_input('Token OpenAI : ', type="password")
    submit_btn = st.form_submit_button(label='Submit')

if submit_btn:
    if save_path is None:
        # Without this guard a submit before any upload would dereference None.
        st.warning("Please upload a PDF file before submitting a question.")
    elif not question_input.strip() or not token_openai_input.strip():
        st.warning("Please provide both a question and an OpenAI token.")
    else:
        # Reuse the path the file was actually written to instead of
        # rebuilding it from the raw upload name.
        answer = process.main_process(
            pdf_path=str(save_path),
            question=question_input,
            openai_key=token_openai_input,
        )
        st.markdown("Answer : ")
        st.markdown(answer)
process.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ # Import langchain lib
4
+ from langchain.llms import OpenAI
5
+ from langchain.chains import RetrievalQA
6
+ from langchain.vectorstores import Chroma
7
+ from langchain.embeddings.openai import OpenAIEmbeddings
8
+ from langchain.text_splitter import CharacterTextSplitter
9
+ from langchain.document_loaders import UnstructuredPDFLoader
10
+ from langchain.chains.question_answering import load_qa_chain
11
+
12
def load_pdf(pdf_path):
    """Load a PDF through ``UnstructuredPDFLoader``.

    Args:
        pdf_path: Location of the PDF file to load.

    Returns:
        Whatever ``UnstructuredPDFLoader(pdf_path).load()`` returns
        (presumably a list of LangChain documents -- confirm against the
        installed LangChain version).
    """
    return UnstructuredPDFLoader(pdf_path).load()
16
+
17
def update_openai_key(openai_key):
    """Export the OpenAI API key through the ``OPENAI_API_KEY`` env variable.

    Surrounding whitespace (common when a key is pasted into a form field) is
    stripped before the value is stored.

    Args:
        openai_key: The API key as a string.

    Raises:
        TypeError: If ``openai_key`` is not a string.
        ValueError: If ``openai_key`` is empty or whitespace only; failing
            here is clearer than exporting an empty key and failing later.
    """
    if not isinstance(openai_key, str):
        raise TypeError(
            f"openai_key must be a str, not {type(openai_key).__name__}"
        )

    cleaned_key = openai_key.strip()
    if not cleaned_key:
        raise ValueError("openai_key must not be empty")

    os.environ['OPENAI_API_KEY'] = cleaned_key
19
+
20
def texts_splitter(pages, chunk_size=3000, chunk_overlap=20):
    """Split loaded documents into chunks with ``CharacterTextSplitter``.

    Args:
        pages: Documents to split, as produced by ``load_pdf``.
        chunk_size: Value passed as the splitter's ``chunk_size``. Defaults
            to 3000, the value that was previously hard-coded.
        chunk_overlap: Value passed as the splitter's ``chunk_overlap``.
            Defaults to 20, the value that was previously hard-coded.

    Returns:
        The result of ``CharacterTextSplitter.split_documents(pages)``.
    """
    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(pages)
24
+
25
def qa_langchain(docsearch):
    """Build a ``RetrievalQA`` chain backed by the given vector store.

    Args:
        docsearch: Vector store exposing ``as_retriever()``.

    Returns:
        A ``RetrievalQA`` instance whose combine-documents chain is a
        ``"stuff"`` QA chain driven by ``OpenAI(temperature=0)``.
    """
    llm = OpenAI(temperature=0)
    combine_chain = load_qa_chain(llm, chain_type="stuff")
    return RetrievalQA(
        combine_documents_chain=combine_chain,
        retriever=docsearch.as_retriever(),
    )
29
+
30
def main_process(pdf_path, question, openai_key):
    """Answer a question about a PDF using the helpers of this module.

    Steps, in order: export the API key, load the PDF, split it into chunks,
    index the chunks in a Chroma store with OpenAI embeddings, then run the
    retrieval QA chain on the question.

    Args:
        pdf_path: Path of the PDF to query.
        question: Question passed to the QA chain's ``run``.
        openai_key: OpenAI API key, exported via ``update_openai_key``.

    Returns:
        The value returned by the QA chain's ``run(question)``.
    """
    # The key is exported first: the embeddings and LLM objects below are
    # created without an explicit key, so they presumably read it from the
    # environment.
    update_openai_key(openai_key)

    chunks = texts_splitter(load_pdf(pdf_path))
    vector_store = Chroma.from_documents(chunks, OpenAIEmbeddings())

    return qa_langchain(vector_store).run(question)
46
+
47
+
48
if __name__ == "__main__":
    # SECURITY: an API key used to be hard-coded here. Secrets must never be
    # committed; the key is now read from the environment. Any key that was
    # ever committed should be treated as compromised and revoked.
    api_key = os.environ.get('OPENAI_API_KEY')
    if not api_key:
        raise SystemExit(
            "Set the OPENAI_API_KEY environment variable before running this script."
        )

    # Manual smoke test against a sample document.
    answer = main_process(
        pdf_path='assets/599-Article Text-5382-1-10-20230603.pdf',
        question='Keberadaan mahkamah partai untuk',
        openai_key=api_key,
    )
    print(answer)
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ openai
3
+ pypdf
4
+ langchain
5
+ unstructured
6
+ chromadb
7
+ tiktoken
8
+ pdf2image
9
+ urllib3==1.26.6
10
+ tabulate