pdf2AIextract

Sleeping

App Files Files Community

ShayanRl committed on Jul 22, 2024

Commit

ea4451e

verified ·

1 Parent(s): 6cfecea

Update app.py

Browse files

Files changed (1) hide show

app.py +68 -11

app.py CHANGED Viewed

@@ -1,14 +1,32 @@
 import streamlit as st
 import io
 import requests
-from docquery import document, pipeline
-p = pipeline('document-question-answering')
 def fextractURL(pdf_path):
@@ -19,12 +37,36 @@ def fextractURL(pdf_path):
             # If the URL ends with .pdf, use pdfplumber directly
             r = requests.get(pdf_path)
             f = io.BytesIO(r.content)
-            doc = document.load_document(f)
-            for q in ["What is the 2022 net income?", "What is the 2023 net income ?"]:
-                extracted_data+= (q, p(question=q, **doc.context))
     except Exception as e:
-        st.error(f"An error o0000ccurred: {str(e)}")
     return extracted_data
@@ -34,13 +76,28 @@ st.markdown(vert_space, unsafe_allow_html=True)
 st.write("Extract full text from PDF URL")
 pdfURL = st.text_input(label="PDF URL", value="", max_chars=None, key=None, type="default", help=None, autocomplete=None, on_change=None, args=None, kwargs=None, placeholder=None, disabled=False, label_visibility="visible")
 button = st.button(label='Extract', key=None, help=None, on_click=None, args=None, kwargs=None, type="secondary", disabled=False, use_container_width=False)
 extractedText = st.empty()
 if button:
     try:
         text = fextractURL(pdfURL)
-        print(text)
-        extractedText.text(text)
     except Exception as e:
-        st.error(f"An error occurrrrred: {str(e)}")

 import streamlit as st
 import io
 import requests
+import pdfplumber
+import os
+from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
+from groq import Groq
+client = Groq(
+    api_key=os.getenv("groq_token"),
+)
+def AImodel(text,question):
+    """Ask the Groq chat model to extract `question` from `text`.
+
+    Sends a system instruction followed by a single user message that embeds
+    both the question and the source text, and returns the message content of
+    the first completion choice.
+    """
+    chat_completion = client.chat.completions.create(
+        messages=[
+            # System instruction goes first so it frames the user request.
+            {
+                "role": "system",
+                "content": "You are a helpful questioning/awnsering AI. Provide the bullet points exact answer from the provided text and do not generate new text.",
+            },
+            {
+                "role": "user",
+                "content": f"extract {question} in this text:{text}",
+            },
+        ],
+        model="llama3-groq-8b-8192-tool-use-preview",
+    )
+    return chat_completion.choices[0].message.content
 def fextractURL(pdf_path):
             # If the URL ends with .pdf, use pdfplumber directly
             r = requests.get(pdf_path)
             f = io.BytesIO(r.content)
+            with pdfplumber.open(f) as pdf:
+                for page in pdf.pages:
+                    extracted_data += page.extract_text() + "\n"  # Extract text
+                    tables = page.extract_tables()  # Extract tables
+                    for table in tables:
+                        for row in table:
+                            extracted_data += "\t".join(str(cell) for cell in row) + "\n"
+        else:
+            # If the URL does not end with .pdf, download the PDF first
+            response = requests.get(pdf_path)
+            pdf_content = response.content
+            # Save the PDF locally
+            pdf_filename = 'downloaded_document.pdf'
+            with open(pdf_filename, 'wb') as pdf_file:
+                pdf_file.write(pdf_content)
+            # Extract content using pdfplumber
+            with pdfplumber.open(pdf_filename) as pdf:
+                for page in pdf.pages:
+                    extracted_data += page.extract_text() + "\n"  # Extract text
+                    tables = page.extract_tables()  # Extract tables
+                    for table in tables:
+                        for row in table:
+                            extracted_data += "\t".join(str(cell) for cell in row) + "\n"
+            # Delete the PDF file
+            os.remove(pdf_filename)
     except Exception as e:
+        st.error(f"An error occurred: {str(e)}")
     return extracted_data
 st.write("Extract full text from PDF URL")
 pdfURL = st.text_input(label="PDF URL", value="", max_chars=None, key=None, type="default", help=None, autocomplete=None, on_change=None, args=None, kwargs=None, placeholder=None, disabled=False, label_visibility="visible")
+questionText = st.text_input(label="prompt", value="", max_chars=None, key=None, type="default", help="""You can make your prompt very specific to get the desired output. For example, if you need an invoice number, you can format your prompt like this:
+Example Prompt:
+*Extract items with the following details:
+Invoice Number: xxxx
+Date: [Insert Date format]
+Customer Name:
+Total Amount:
+By providing clear and detailed information in your prompt, you'll receive an accurate and tailored response.""", autocomplete=None, on_change=None, args=None, kwargs=None, placeholder=None, disabled=False, label_visibility="visible")
 button = st.button(label='Extract', key=None, help=None, on_click=None, args=None, kwargs=None, type="secondary", disabled=False, use_container_width=False)
 extractedText = st.empty()
 if button:
     try:
         text = fextractURL(pdfURL)
+        AItext = AImodel(text,questionText)
+        extractedText.text(AItext)
     except Exception as e:
+        st.error(f"An error occurred: {str(e)}")