pdf2AIextract

Sleeping

File size: 3,735 Bytes

cadae78
 
 
ea4451e
 
 
f733827
ea4451e
 
 
1d0c7b7
16a77db
 
ea4451e
 
 
 
 
 
 
 
1907239
ea4451e
 
 
 
 
440b73f
cadae78
 
 
c45f030
ffd8879
 
 
 
 
ea4451e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ffd8879
ea4451e
ffd8879
cadae78
 
 
a673cfb
6ef59e8
ffd8879
cadae78
ffd8879
ea4451e
 
 
 
 
 
 
 
 
 
cadae78
 
 
 
 
 
ea4451e
 
 
cadae78
ea4451e

import streamlit as st
import io
import requests
import pdfplumber
import os
from groq import Groq

# Module-level Groq API client shared by the helpers below; the API key is
# read from the "groq_token" environment variable (None when it is unset).
client = Groq(api_key=os.environ.get("groq_token"))



def AImodel(text, question):
    """Ask the Groq chat model to extract ``question`` from ``text``.

    Args:
        text: Source text to search (here, the content extracted from a PDF).
        question: Description of what should be extracted from ``text``.

    Returns:
        The content of the first completion choice as a string, or an empty
        string when the API returns no content for that choice.

    Raises:
        Whatever the Groq client raises for a failed request; errors are not
        handled here and propagate to the caller.
    """
    chat_completion = client.chat.completions.create(
        messages=[
            # The system instruction goes first so it frames the user request
            # instead of trailing it.
            {
                "role": "system",
                "content": "You are a helpful questioning/awnsering AI. Provide the exact answer from the provided text and do not generate new text.",
            },
            {
                "role": "user",
                "content": f"extract {question} in this text:{text}",
            },
        ],
        # NOTE(review): this is a preview model id; confirm it is still served.
        model="llama3-groq-8b-8192-tool-use-preview",
    )
    # `content` is optional in the response schema; normalise None to "" so
    # callers always receive a string.
    return chat_completion.choices[0].message.content or ""


def fextractURL(pdf_path, timeout=30):
    """Download the PDF at ``pdf_path`` and return its text and table content.

    The document is fetched over HTTP and parsed entirely in memory, so no
    temporary file is written. For every page the plain text is emitted first,
    followed by one tab-separated line per table row.

    Args:
        pdf_path: URL of the PDF document (with or without a ``.pdf`` suffix).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The extracted content as a single string. If an error occurs, it is
        reported through ``st.error`` and whatever was extracted before the
        failure (possibly an empty string) is returned.
    """
    chunks = []

    try:
        # A timeout keeps a stalled server from hanging the app, and checking
        # the status avoids handing an HTTP error page to the PDF parser.
        response = requests.get(pdf_path, timeout=timeout)
        response.raise_for_status()

        with pdfplumber.open(io.BytesIO(response.content)) as pdf:
            for page in pdf.pages:
                # Pages without a text layer may yield None instead of a string.
                chunks.append((page.extract_text() or "") + "\n")
                for table in page.extract_tables():
                    for row in table:
                        # Empty table cells come back as None; emit them as
                        # blanks rather than the literal text "None".
                        cells = ("" if cell is None else str(cell) for cell in row)
                        chunks.append("\t".join(cells) + "\n")
    except Exception as e:
        # UI boundary: surface the failure to the user instead of crashing.
        st.error(f"An error occurred: {str(e)}")

    return "".join(chunks)


# Tooltip shown next to the prompt field.
_PROMPT_HELP = (
    "You can make your prompt very specific to get the desired output. "
    "For example, if you need an invoice number, you can format your prompt like this:\n"
    "\n"
    "Example Prompt:\n"
    "*Extract items with the following details:\n"
    "\n"
    "Invoice Number: xxxx\n"
    "Date: [Insert Date format]\n"
    "Customer Name: \n"
    "Total Amount:\n"
    "By providing clear and detailed information in your prompt, "
    "you'll receive an accurate and tailored response."
)

# Page layout: a vertical spacer, a caption, the two inputs, the trigger
# button and a placeholder that later receives the model's answer.
vert_space = '<div style="padding: 3rem 1rem;"></div>'
st.markdown(vert_space, unsafe_allow_html=True)
st.write("Extract full text from PDF URL")

# Only non-default widget options are passed explicitly.
pdfURL = st.text_input(label="PDF URL")
questionText = st.text_input(label="prompt", help=_PROMPT_HELP)
button = st.button(label='Extract')
extractedText = st.empty()

# Run the extraction pipeline when the button is pressed: download and parse
# the PDF, then ask the model for the requested details.
if button:
    url = pdfURL.strip()
    if not url:
        # Nothing to download; avoid a pointless request and model call.
        st.error("Please enter a PDF URL.")
    else:
        try:
            text = fextractURL(url)
            if text.strip():
                AItext = AImodel(text, questionText)
                extractedText.text(AItext)
            else:
                # Either the download/parse failed (already reported by
                # fextractURL) or the PDF has no extractable text; do not
                # query the model with empty input.
                st.error("No text could be extracted from the PDF.")

        except Exception as e:
            st.error(f"An error occurred: {str(e)}")