File size: 3,735 Bytes
cadae78
 
 
ea4451e
 
 
f733827
ea4451e
 
 
1d0c7b7
16a77db
 
ea4451e
 
 
 
 
 
 
 
1907239
ea4451e
 
 
 
 
440b73f
cadae78
 
 
c45f030
ffd8879
 
 
 
 
ea4451e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ffd8879
ea4451e
ffd8879
cadae78
 
 
a673cfb
6ef59e8
ffd8879
cadae78
ffd8879
ea4451e
 
 
 
 
 
 
 
 
 
cadae78
 
 
 
 
 
ea4451e
 
 
cadae78
ea4451e
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import streamlit as st
import io
import requests
import pdfplumber
import os
from groq import Groq

# Shared Groq API client used by the helpers below. The API key is read from
# the "groq_token" environment variable (None when the variable is unset).
client = Groq(api_key=os.environ.get("groq_token"))



def AImodel(text, question, model="llama3-groq-8b-8192-tool-use-preview"):
    """Ask the Groq chat model to extract ``question`` from ``text``.

    Args:
        text: The source text to search (in this script, text pulled from a PDF).
        question: Description of the information to extract from ``text``.
        model: Groq model identifier. Defaults to the model this function
            previously hard-coded, so existing two-argument calls are unchanged.

    Returns:
        The message content of the first completion choice.
    """
    messages = [
        # The system instruction is listed first so that it frames the user
        # turn, rather than trailing after it.
        {
            "role": "system",
            "content": "You are a helpful questioning/awnsering AI. Provide the exact answer from the provided text and do not generate new text.",
        },
        {
            "role": "user",
            "content": f"extract {question} in this text:{text}",
        },
    ]
    chat_completion = client.chat.completions.create(
        messages=messages,
        model=model,
    )
    return chat_completion.choices[0].message.content


def fextractURL(pdf_path):
    """Download the PDF at ``pdf_path`` and return its text and table contents.

    The document is fetched over HTTP and parsed entirely in memory. For each
    page, the page text is appended followed by a newline, then every row of
    every table found on the page as tab-separated cells, one row per line.

    Args:
        pdf_path: URL of the PDF document to download.

    Returns:
        The concatenated extracted text. If an error occurs, it is reported
        through ``st.error`` and whatever was extracted before the failure
        (possibly an empty string) is returned.
    """
    parts = []

    try:
        # A timeout keeps an unresponsive server from blocking the app forever.
        response = requests.get(pdf_path, timeout=30)
        # Surface HTTP errors directly instead of handing an error page to the
        # PDF parser.
        response.raise_for_status()

        # Parse from memory regardless of the URL's suffix: no shared temp file
        # on disk means nothing to clean up and no clash between concurrent
        # sessions.
        with pdfplumber.open(io.BytesIO(response.content)) as pdf:
            for page in pdf.pages:
                # Guard against extract_text() returning None (some pdfplumber
                # versions do so for pages without a text layer).
                parts.append((page.extract_text() or "") + "\n")
                for table in page.extract_tables():
                    for row in table:
                        parts.append("\t".join(str(cell) for cell in row) + "\n")
    except Exception as e:
        st.error(f"An error occurred: {str(e)}")

    return "".join(parts)


# --- Page layout -----------------------------------------------------------
# Blank padded block to push the content down from the top of the page.
vert_space = '<div style="padding: 3rem 1rem;"></div>'
st.markdown(vert_space, unsafe_allow_html=True)
st.write("Extract full text from PDF URL")

# Input widgets. Only non-default options are passed.
pdfURL = st.text_input(label="PDF URL")
questionText = st.text_input(
    label="prompt",
    help="""You can make your prompt very specific to get the desired output. For example, if you need an invoice number, you can format your prompt like this:

Example Prompt:
*Extract items with the following details:

Invoice Number: xxxx
Date: [Insert Date format]
Customer Name: 
Total Amount:
By providing clear and detailed information in your prompt, you'll receive an accurate and tailored response.""",
)
button = st.button(label='Extract')
# Placeholder that is filled with the model's answer after a successful run.
extractedText = st.empty()

if button:
    url = pdfURL.strip()
    if not url:
        # Nothing to download; avoid a pointless request and model call.
        st.warning("Please enter a PDF URL.")
    else:
        try:
            text = fextractURL(url)
            if text.strip():
                AItext = AImodel(text, questionText)
                extractedText.text(AItext)
            else:
                # fextractURL returns an empty string when the download or
                # parsing failed (or the PDF has no extractable text); do not
                # query the model with nothing to read.
                st.warning("No text could be extracted from the PDF.")
        except Exception as e:
            st.error(f"An error occurred: {str(e)}")