"""Streamlit app that queries the contents of a PDF with a Groq-hosted LLM.

The user supplies a PDF URL and a prompt. The PDF is downloaded, its text and
tables are extracted with pdfplumber, and the extracted text is sent to the
model together with the prompt. The model's answer is shown on the page.
"""

import io
import os

import pdfplumber
import requests
import streamlit as st
from groq import Groq

# Seconds to wait for the PDF download before giving up, so a stalled server
# cannot hang the Streamlit session indefinitely.
DOWNLOAD_TIMEOUT_SECONDS = 30

client = Groq(
    api_key=os.getenv("groq_token"),
)


def AImodel(text, question):
    """Ask the Groq chat model to extract ``question`` from ``text``.

    Args:
        text: Source text the answer should be taken from.
        question: What the user wants extracted from ``text``.

    Returns:
        The content of the first completion choice returned by the API.
    """
    chat_completion = client.chat.completions.create(
        messages=[
            # The system instruction goes first so it frames the user turn.
            {
                "role": "system",
                "content": "You are a helpful questioning/awnsering AI. Provide the exact answer from the provided text and do not generate new text."
            },
            {
                "role": "user",
                "content": f"extract {question} in this text:{text}",
            },
        ],
        # NOTE(review): this is a preview model id; confirm it is still served.
        model="llama3-groq-8b-8192-tool-use-preview",
    )
    return chat_completion.choices[0].message.content


def _page_to_text(page):
    """Return one page's text followed by its tables as tab-separated rows."""
    # extract_text() may yield None for a page without text; treat it as empty
    # instead of failing on None + str.
    pieces = [(page.extract_text() or "") + "\n"]
    for table in page.extract_tables():
        for row in table:
            # Empty cells come back as None; emit an empty field rather than
            # the literal string "None".
            cells = ("" if cell is None else str(cell) for cell in row)
            pieces.append("\t".join(cells) + "\n")
    return "".join(pieces)


def fextractURL(pdf_path):
    """Download the PDF at ``pdf_path`` and return its text and table content.

    The document is parsed from memory, so no temporary file is written and
    the URL does not need to end in ``.pdf``.

    Args:
        pdf_path: URL of the PDF document.

    Returns:
        The extracted text, one chunk per page (page text, then table rows).
        If the download or parsing fails, the error is reported through
        ``st.error`` and whatever was extracted before the failure (possibly
        an empty string) is returned.
    """
    page_chunks = []
    try:
        response = requests.get(pdf_path, timeout=DOWNLOAD_TIMEOUT_SECONDS)
        # Fail on HTTP errors here instead of handing an error page to the
        # PDF parser.
        response.raise_for_status()
        with pdfplumber.open(io.BytesIO(response.content)) as pdf:
            for page in pdf.pages:
                page_chunks.append(_page_to_text(page))
    except Exception as e:
        # UI boundary: show the problem to the user rather than crash the app.
        st.error(f"An error occurred: {str(e)}")
    return "".join(page_chunks)


# NOTE(review): the spacer is only a newline; presumably it once held HTML
# markup (it is rendered with unsafe_allow_html=True) -- confirm.
vert_space = "\n"
st.markdown(vert_space, unsafe_allow_html=True)
st.write("Extract full text from PDF URL")

pdfURL = st.text_input(label="PDF URL")
questionText = st.text_input(
    label="prompt",
    help=(
        "You can make your prompt very specific to get the desired output. "
        "For example, if you need an invoice number, you can format your prompt like this: "
        "Example Prompt: *Extract items with the following details: "
        "Invoice Number: xxxx Date: [Insert Date format] Customer Name: Total Amount: "
        "By providing clear and detailed information in your prompt, "
        "you'll receive an accurate and tailored response."
    ),
)
button = st.button(label="Extract")
extractedText = st.empty()

if button:
    url = pdfURL.strip()
    if not url:
        st.warning("Please enter a PDF URL.")
    else:
        try:
            text = fextractURL(url)
            if text.strip():
                AItext = AImodel(text, questionText)
                extractedText.text(AItext)
            else:
                # Nothing to query: skip the LLM call instead of sending an
                # empty document.
                st.warning("No text was extracted from the PDF.")
        except Exception as e:
            st.error(f"An error occurred: {str(e)}")