Spaces:
Sleeping
Sleeping
File size: 3,735 Bytes
cadae78 ea4451e f733827 ea4451e 1d0c7b7 16a77db ea4451e 1907239 ea4451e 440b73f cadae78 c45f030 ffd8879 ea4451e ffd8879 ea4451e ffd8879 cadae78 a673cfb 6ef59e8 ffd8879 cadae78 ffd8879 ea4451e cadae78 ea4451e cadae78 ea4451e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
import streamlit as st
import io
import requests
import pdfplumber
import os
from groq import Groq
# Groq API client; the key is read from the environment so it never lives in source.
client = Groq(api_key=os.getenv("groq_token"))
def AImodel(text, question):
    """Ask the Groq-hosted LLM to answer *question* using only *text*.

    Parameters
    ----------
    text : str
        The document content (extracted PDF text/tables) to search.
    question : str
        The user's prompt describing what to extract.

    Returns
    -------
    str
        The model's answer, taken verbatim from the first completion choice.
    """
    chat_completion = client.chat.completions.create(
        messages=[
            # System message first: chat APIs apply it as standing instructions
            # for the whole conversation, so it should precede the user turn.
            {
                "role": "system",
                "content": "You are a helpful questioning/answering AI. Provide the exact answer from the provided text and do not generate new text.",
            },
            {
                "role": "user",
                "content": f"extract {question} in this text:{text}",
            },
        ],
        model="llama3-groq-8b-8192-tool-use-preview",
    )
    return chat_completion.choices[0].message.content
def fextractURL(pdf_path):
    """Download the PDF at *pdf_path* and return its text and table content.

    Every page's text layer is appended, followed by any tables on that page
    rendered as tab-separated rows. The PDF is parsed entirely in memory, so
    no temporary file is created regardless of the URL's extension.

    Parameters
    ----------
    pdf_path : str
        URL of the PDF document (with or without a ``.pdf`` suffix).

    Returns
    -------
    str
        The extracted content, or ``""`` if anything failed (the error is
        shown to the user via ``st.error`` rather than raised).
    """
    extracted_data = ""
    try:
        response = requests.get(pdf_path, timeout=30)
        # Fail fast on HTTP errors instead of handing an error page to pdfplumber.
        response.raise_for_status()
        with pdfplumber.open(io.BytesIO(response.content)) as pdf:
            for page in pdf.pages:
                # extract_text() returns None for pages with no text layer
                # (e.g. scanned images); skip those instead of crashing.
                page_text = page.extract_text()
                if page_text:
                    extracted_data += page_text + "\n"
                for table in page.extract_tables():
                    for row in table:
                        extracted_data += "\t".join(str(cell) for cell in row) + "\n"
    except Exception as e:
        # Best-effort UI reporting: the caller receives whatever was
        # extracted before the failure (possibly the empty string).
        st.error(f"An error occurred: {str(e)}")
    return extracted_data
# Top-of-page vertical spacer (raw HTML, hence unsafe_allow_html) and heading.
st.markdown('<div style="padding: 3rem 1rem;"></div>', unsafe_allow_html=True)
st.write("Extract full text from PDF URL")
# Input widgets: the PDF location, the extraction prompt, and the trigger
# button. Streamlit defaults are left implicit rather than spelled out.
pdfURL = st.text_input(label="PDF URL")
questionText = st.text_input(
    label="prompt",
    help="""You can make your prompt very specific to get the desired output. For example, if you need an invoice number, you can format your prompt like this:
Example Prompt:
*Extract items with the following details:
Invoice Number: xxxx
Date: [Insert Date format]
Customer Name:
Total Amount:
By providing clear and detailed information in your prompt, you'll receive an accurate and tailored response.""",
)
button = st.button(label='Extract')
# Placeholder that will be filled with the model's answer after extraction.
extractedText = st.empty()
# On click: pull the document text from the URL, ask the model the user's
# question about it, and render the answer in the placeholder.
if button:
    try:
        document_text = fextractURL(pdfURL)
        answer = AImodel(document_text, questionText)
        extractedText.text(answer)
    except Exception as e:
        st.error(f"An error occurred: {str(e)}")
|