Update app.py

#1
by micbon - opened
Files changed (1) hide show
  1. app.py +9 -10
app.py CHANGED
@@ -14,27 +14,27 @@ from langchain.document_loaders import PyPDFLoader
14
  from langchain.vectorstores import FAISS
15
  from langchain.docstore.document import Document
16
 
17
- llm = GooglePalm(temperature=0.9)
 
18
 
19
  st.title("PDF Query Tool")
20
- st.write("Upload your PDF and ask question from it")
21
 
22
  uploaded_file = st.file_uploader("Choose a PDF file")
23
  main_placeholder = st.empty()
24
  second_placeholder = st.empty()
25
 
26
-
27
  if uploaded_file:
28
  filename = uploaded_file.name
29
  if not filename.endswith(('.pdf', '.PDF')):
30
- main_placeholder.warning("Choose PDF Document !!!")
31
  exit()
32
  elif not os.path.exists(uploaded_file.name):
33
  main_placeholder.text("Data Loading Started...⌛⌛⌛")
34
  with open(f'{uploaded_file.name}', 'wb') as f:
35
  f.write(uploaded_file.getbuffer())
36
 
37
- pdf_loader = PyPDFLoader(uploaded_file.name)
38
  documents = pdf_loader.load()
39
 
40
  raw_text = ''
@@ -42,19 +42,20 @@ if uploaded_file:
42
  raw_text += doc.page_content
43
 
44
  if len(raw_text) < 10:
45
- main_placeholder.text("It looks like Scanned PDF, No worries converting it...⌛⌛⌛")
46
  raw_text = get_text_from_scanned_pdf(uploaded_file.name)
47
 
48
  main_placeholder.text("Splitting text into smaller chunks...⌛⌛⌛")
 
49
  text_splitter = RecursiveCharacterTextSplitter(
50
- separators=['\n\n', '\n', '.', ','],
51
  chunk_size=2000
52
  )
53
 
54
  texts = text_splitter.split_text(raw_text)
55
  docs = [Document(page_content=t) for t in texts]
56
 
57
- embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-base")
58
  main_placeholder.text("Storing data into Vector Database...⌛⌛⌛")
59
  vectorstore = FAISS.from_documents(docs, embeddings)
60
 
@@ -64,7 +65,6 @@ if uploaded_file:
64
 
65
  main_placeholder.text("Data Loading Completed...✅✅✅")
66
 
67
-
68
  query = second_placeholder.text_input("Question:")
69
  if query:
70
  if os.path.exists(f'vector_store_{uploaded_file.name}.pkl'):
@@ -93,4 +93,3 @@ if query:
93
  result = chain({"query": query})
94
  st.header("Answer")
95
  st.write(result["result"])
96
-
 
14
  from langchain.vectorstores import FAISS
15
  from langchain.docstore.document import Document
16
 
17
+ # Update the language model to support Italian
18
+ llm = GooglePalm(temperature=0.9, lang="it")
19
 
20
  st.title("PDF Query Tool")
21
+ st.write("Upload your PDF and ask questions from it")
22
 
23
  uploaded_file = st.file_uploader("Choose a PDF file")
24
  main_placeholder = st.empty()
25
  second_placeholder = st.empty()
26
 
 
27
  if uploaded_file:
28
  filename = uploaded_file.name
29
  if not filename.endswith(('.pdf', '.PDF')):
30
+ main_placeholder.warning("Choose a PDF Document !!!")
31
  exit()
32
  elif not os.path.exists(uploaded_file.name):
33
  main_placeholder.text("Data Loading Started...⌛⌛⌛")
34
  with open(f'{uploaded_file.name}', 'wb') as f:
35
  f.write(uploaded_file.getbuffer())
36
 
37
+ pdf_loader = PyPDFLoader(uploaded_file.name, lang="it") # Specify Italian language
38
  documents = pdf_loader.load()
39
 
40
  raw_text = ''
 
42
  raw_text += doc.page_content
43
 
44
  if len(raw_text) < 10:
45
+ main_placeholder.text("It looks like a Scanned PDF, converting it...⌛⌛⌛")
46
  raw_text = get_text_from_scanned_pdf(uploaded_file.name)
47
 
48
  main_placeholder.text("Splitting text into smaller chunks...⌛⌛⌛")
49
+ # Update the text splitting logic to handle Italian
50
  text_splitter = RecursiveCharacterTextSplitter(
51
+ separators=['\n\n', '\n', '.', ',', '!', '?'], # Add Italian punctuation
52
  chunk_size=2000
53
  )
54
 
55
  texts = text_splitter.split_text(raw_text)
56
  docs = [Document(page_content=t) for t in texts]
57
 
58
+ embeddings = HuggingFaceInstructEmbeddings(model_name="Helsinki-NLP/opus-mt-it-en")
59
  main_placeholder.text("Storing data into Vector Database...⌛⌛⌛")
60
  vectorstore = FAISS.from_documents(docs, embeddings)
61
 
 
65
 
66
  main_placeholder.text("Data Loading Completed...✅✅✅")
67
 
 
68
  query = second_placeholder.text_input("Question:")
69
  if query:
70
  if os.path.exists(f'vector_store_{uploaded_file.name}.pkl'):
 
93
  result = chain({"query": query})
94
  st.header("Answer")
95
  st.write(result["result"])