Spaces:

Nirav-Khanpara
/

Query-PDF

Build error

App Files Community

Update app.py

by micbon - opened Dec 19, 2023

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+9 -10

Files changed (1) hide show

app.py +9 -10

app.py CHANGED Viewed

@@ -14,27 +14,27 @@ from langchain.document_loaders import PyPDFLoader
 from langchain.vectorstores import FAISS
 from langchain.docstore.document import Document
-llm = GooglePalm(temperature=0.9)
 st.title("PDF Query Tool")
-st.write("Upload your PDF and ask question from it")
 uploaded_file = st.file_uploader("Choose a PDF file")
 main_placeholder = st.empty()
 second_placeholder = st.empty()
 if uploaded_file:
     filename = uploaded_file.name
     if not filename.endswith(('.pdf', '.PDF')):
-        main_placeholder.warning("Choose PDF Document !!!")
         exit()
     elif not os.path.exists(uploaded_file.name):
         main_placeholder.text("Data Loading Started...⌛⌛⌛")
         with open(f'{uploaded_file.name}', 'wb') as f:
             f.write(uploaded_file.getbuffer())
-        pdf_loader = PyPDFLoader(uploaded_file.name)
         documents = pdf_loader.load()
         raw_text = ''
@@ -42,19 +42,20 @@ if uploaded_file:
             raw_text += doc.page_content
         if len(raw_text) < 10:
-            main_placeholder.text("It looks like Scanned PDF, No worries converting it...⌛⌛⌛")
             raw_text = get_text_from_scanned_pdf(uploaded_file.name)
         main_placeholder.text("Splitting text into smaller chunks...⌛⌛⌛")
         text_splitter = RecursiveCharacterTextSplitter(
-            separators=['\n\n', '\n', '.', ','],
             chunk_size=2000
         )
         texts = text_splitter.split_text(raw_text)
         docs = [Document(page_content=t) for t in texts]
-        embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-base")
         main_placeholder.text("Storing data into Vector Database...⌛⌛⌛")
         vectorstore = FAISS.from_documents(docs, embeddings)
@@ -64,7 +65,6 @@ if uploaded_file:
     main_placeholder.text("Data Loading Completed...✅✅✅")
 query = second_placeholder.text_input("Question:")
 if query:
     if os.path.exists(f'vector_store_{uploaded_file.name}.pkl'):
@@ -93,4 +93,3 @@ if query:
             result = chain({"query": query})
         st.header("Answer")
         st.write(result["result"])

 from langchain.vectorstores import FAISS
 from langchain.docstore.document import Document
+# Update the language model to support Italian
+llm = GooglePalm(temperature=0.9, lang="it")
 st.title("PDF Query Tool")
+st.write("Upload your PDF and ask questions from it")
 uploaded_file = st.file_uploader("Choose a PDF file")
 main_placeholder = st.empty()
 second_placeholder = st.empty()
 if uploaded_file:
     filename = uploaded_file.name
     if not filename.endswith(('.pdf', '.PDF')):
+        main_placeholder.warning("Choose a PDF Document !!!")
         exit()
     elif not os.path.exists(uploaded_file.name):
         main_placeholder.text("Data Loading Started...⌛⌛⌛")
         with open(f'{uploaded_file.name}', 'wb') as f:
             f.write(uploaded_file.getbuffer())
+        pdf_loader = PyPDFLoader(uploaded_file.name, lang="it")  # Specify Italian language
         documents = pdf_loader.load()
         raw_text = ''
             raw_text += doc.page_content
         if len(raw_text) < 10:
+            main_placeholder.text("It looks like a Scanned PDF, converting it...⌛⌛⌛")
             raw_text = get_text_from_scanned_pdf(uploaded_file.name)
         main_placeholder.text("Splitting text into smaller chunks...⌛⌛⌛")
+        # Update the text splitting logic to handle Italian
         text_splitter = RecursiveCharacterTextSplitter(
+            separators=['\n\n', '\n', '.', ',', '!', '?'],  # Add Italian punctuation
             chunk_size=2000
         )
         texts = text_splitter.split_text(raw_text)
         docs = [Document(page_content=t) for t in texts]
+        embeddings = HuggingFaceInstructEmbeddings(model_name="Helsinki-NLP/opus-mt-it-en")
         main_placeholder.text("Storing data into Vector Database...⌛⌛⌛")
         vectorstore = FAISS.from_documents(docs, embeddings)
     main_placeholder.text("Data Loading Completed...✅✅✅")
 query = second_placeholder.text_input("Question:")
 if query:
     if os.path.exists(f'vector_store_{uploaded_file.name}.pkl'):
             result = chain({"query": query})
         st.header("Answer")
         st.write(result["result"])