# pdf2AIextract / app.py
# ShayanRl's picture
# Update app.py
# 1907239 verified
import streamlit as st
import io
import requests
import pdfplumber
import os
from groq import Groq
# Module-level Groq client shared by every request; the API key is read from
# the "groq_token" environment variable.
client = Groq(api_key=os.getenv("groq_token"))
def AImodel(text: str, question: str,
            model: str = "llama3-groq-8b-8192-tool-use-preview") -> str:
    """Ask the Groq chat model to extract ``question`` from ``text``.

    Args:
        text: Document text the answer must be taken from.
        question: Description of what should be extracted.
        model: Groq model identifier used for the completion. The default is
            the model this app has always used.
            NOTE(review): this is a preview model id and may have been
            retired by the provider -- confirm against Groq's current model
            list and pass a replacement if needed.

    Returns:
        The content of the first completion choice, or an empty string when
        the response carries no content.

    Raises:
        Whatever the Groq client raises for transport or API errors; the
        caller is expected to report those.
    """
    messages = [
        # The system instruction goes first so it frames the user request,
        # which is the conventional ordering for chat-completion APIs.
        {
            "role": "system",
            "content": "You are a helpful questioning/awnsering AI. Provide the exact answer from the provided text and do not generate new text."
        },
        {
            "role": "user",
            "content": f"extract {question} in this text:{text}",
        },
    ]
    chat_completion = client.chat.completions.create(
        messages=messages,
        model=model,
    )
    # ``content`` is optional in the response schema; normalise to a string.
    return chat_completion.choices[0].message.content or ""
def _page_to_text(page):
    """Return one page's text followed by its tables as tab-separated rows."""
    # extract_text() may return None for a page without a text layer in some
    # pdfplumber versions; treat that as an empty page instead of failing.
    lines = [page.extract_text() or ""]
    for table in page.extract_tables():
        for row in table:
            # Table cells may be None when empty; render them as blanks
            # rather than the literal string "None".
            lines.append(
                "\t".join("" if cell is None else str(cell) for cell in row)
            )
    return "\n".join(lines) + "\n"


def fextractURL(pdf_path, timeout=30):
    """Download the PDF at ``pdf_path`` and return its text and table content.

    The document is parsed from an in-memory buffer regardless of the URL's
    suffix, so no temporary file is written (the previous fixed-name file was
    shared between concurrent sessions and was left behind when parsing
    failed).

    Args:
        pdf_path: URL of the PDF document.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        For each page, its extracted text followed by one tab-separated line
        per table row. If an error occurs, it is reported with ``st.error``
        and whatever was extracted before the failure is returned (possibly
        an empty string).
    """
    chunks = []
    try:
        response = requests.get(pdf_path, timeout=timeout)
        # Fail on 4xx/5xx instead of trying to parse an error page as a PDF.
        response.raise_for_status()
        with pdfplumber.open(io.BytesIO(response.content)) as pdf:
            # Append page by page so that pages already processed are kept
            # if a later page raises.
            for page in pdf.pages:
                chunks.append(_page_to_text(page))
    except Exception as e:
        st.error(f"An error occurred: {str(e)}")
    return "".join(chunks)
# ---------------------------------------------------------------------------
# Streamlit page: a URL field, a prompt field and an "Extract" button. On
# click the PDF is downloaded, its text is extracted and the model's answer
# is rendered in a placeholder below the button.
# ---------------------------------------------------------------------------

# Fixed vertical padding above the form.
vert_space = '<div style="padding: 3rem 1rem;"></div>'
st.markdown(vert_space, unsafe_allow_html=True)
st.write("Extract full text from PDF URL")

# Only non-default widget arguments are passed; everything omitted here is
# the widget's documented default.
pdfURL = st.text_input(label="PDF URL")
questionText = st.text_input(
    label="prompt",
    help="""You can make your prompt very specific to get the desired output. For example, if you need an invoice number, you can format your prompt like this:
Example Prompt:
*Extract items with the following details:
Invoice Number: xxxx
Date: [Insert Date format]
Customer Name:
Total Amount:
By providing clear and detailed information in your prompt, you'll receive an accurate and tailored response.""",
)
button = st.button(label='Extract')

# Placeholder that receives the model's answer.
extractedText = st.empty()

if button:
    if not pdfURL.strip():
        # Nothing to download; avoid a pointless request and model call.
        st.warning("Please enter a PDF URL.")
    else:
        try:
            text = fextractURL(pdfURL.strip())
            if text.strip():
                AItext = AImodel(text, questionText)
                extractedText.text(AItext)
            else:
                # fextractURL returns an empty string when the download or
                # parsing failed, or when the PDF has no extractable text.
                # Querying the model with no text would only produce an
                # invented answer.
                st.warning("No text could be extracted from the PDF.")
        except Exception as e:
            st.error(f"An error occurred: {str(e)}")