pdf2AIextract

Sleeping

App Files Files Community

pdf2AIextract / app.py

ShayanRl

Update app.py

1907239 verified over 1 year ago

raw

history blame contribute delete

3.74 kB

	import streamlit as st
	import io
	import requests
	import pdfplumber
	import os
	from groq import Groq

	# Module-level Groq API client shared by the whole app; the API key is read
	# from the "groq_token" environment variable.
	client = Groq(api_key=os.environ.get("groq_token"))



	def AImodel(text, question):
	    """Ask the Groq chat model to extract ``question`` from ``text``.

	    Args:
	        text: Source text to search (in this app, text pulled out of a PDF).
	        question: Description of what should be extracted from ``text``.

	    Returns:
	        The message content of the first choice in the model's reply.
	    """
	    messages = [
	        # The system instruction is sent first so that it frames the user
	        # request that follows it.
	        {
	            "role": "system",
	            "content": "You are a helpful questioning/answering AI. Provide the exact answer from the provided text and do not generate new text.",
	        },
	        {
	            "role": "user",
	            "content": f"extract {question} in this text:{text}",
	        },
	    ]
	    # NOTE(review): this is a preview model id -- confirm it is still served
	    # by Groq before relying on it.
	    chat_completion = client.chat.completions.create(
	        messages=messages,
	        model="llama3-groq-8b-8192-tool-use-preview",
	    )
	    return chat_completion.choices[0].message.content


	def _collect_pdf_text(stream, chunks):
	    """Append the text and table rows of each page of the PDF in ``stream`` to ``chunks``."""
	    with pdfplumber.open(stream) as pdf:
	        for page in pdf.pages:
	            # Guard against a None result (page without extractable text;
	            # seen with some pdfplumber versions) so the concatenation
	            # cannot raise TypeError.
	            chunks.append((page.extract_text() or "") + "\n")
	            for table in page.extract_tables():
	                for row in table:
	                    # Empty cells are reported as None; emit them as blanks
	                    # instead of the literal string "None".
	                    cells = ("" if cell is None else str(cell) for cell in row)
	                    chunks.append("\t".join(cells) + "\n")


	def fextractURL(pdf_path):
	    """Download the PDF at ``pdf_path`` and return its text and tables.

	    For every page, the page text is emitted followed by one line per table
	    row, with cells separated by tabs.

	    Args:
	        pdf_path: URL of the PDF document to download.

	    Returns:
	        The extracted text. If an error occurs, it is reported through
	        ``st.error`` and whatever was extracted before the failure (possibly
	        an empty string) is returned.
	    """
	    chunks = []

	    try:
	        # requests never times out by default; bound the wait so a stalled
	        # server cannot hang the app.
	        response = requests.get(pdf_path, timeout=30)
	        # Surface HTTP errors here instead of feeding an error page to the
	        # PDF parser.
	        response.raise_for_status()
	        # Parse from memory for every URL: no fixed-name temp file that
	        # concurrent sessions could overwrite or that could be left behind
	        # when parsing fails.
	        _collect_pdf_text(io.BytesIO(response.content), chunks)
	    except Exception as e:
	        st.error(f"An error occurred: {str(e)}")

	    return "".join(chunks)


	# Empty padded block used as vertical spacing above the form.
	vert_space = '<div style="padding: 3rem 1rem;"></div>'
	st.markdown(vert_space, unsafe_allow_html=True)
	st.write("Extract full text from PDF URL")

	# Input widgets; only non-default options are passed.
	pdfURL = st.text_input(label="PDF URL", value="")
	questionText = st.text_input(label="prompt", value="", help="""You can make your prompt very specific to get the desired output. For example, if you need an invoice number, you can format your prompt like this:

	Example Prompt:
	*Extract items with the following details:

	Invoice Number: xxxx
	Date: [Insert Date format]
	Customer Name:
	Total Amount:
	By providing clear and detailed information in your prompt, you'll receive an accurate and tailored response.""")
	button = st.button(label='Extract')
	# Placeholder that is filled with the model's answer once it is available.
	extractedText = st.empty()

	if button:
	    try:
	        text = fextractURL(pdfURL.strip())
	        if text.strip():
	            AItext = AImodel(text, questionText)
	            extractedText.text(AItext)
	        else:
	            # Nothing usable was extracted (download/parsing failed or the
	            # PDF has no text layer): do not query the model with empty text.
	            st.warning("No text could be extracted from the PDF.")

	    except Exception as e:
	        st.error(f"An error occurred: {str(e)}")