Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import io | |
| import requests | |
| import pdfplumber | |
| import os | |
| from groq import Groq | |
# Shared Groq API client; the key is read from the "groq_token" environment
# variable (None is passed through if the variable is not set).
client = Groq(api_key=os.environ.get("groq_token"))
def AImodel(text, question):
    """Ask the Groq chat model to extract `question` from `text`.

    Args:
        text: The source text to search.
        question: Description of what to extract; it is interpolated into
            the user prompt together with ``text``.

    Returns:
        The message content of the first choice in the completion response.
    """
    messages = [
        # The system instruction is sent first so it frames the user request
        # that follows, which is the conventional ordering for chat APIs.
        {
            "role": "system",
            "content": "You are a helpful questioning/awnsering AI. Provide the exact answer from the provided text and do not generate new text.",
        },
        {
            "role": "user",
            "content": f"extract {question} in this text:{text}",
        },
    ]
    chat_completion = client.chat.completions.create(
        messages=messages,
        model="llama3-groq-8b-8192-tool-use-preview",
    )
    return chat_completion.choices[0].message.content
def fextractURL(pdf_path, timeout=30):
    """Download a PDF from a URL and return its text and table contents.

    The document is fetched with ``requests`` and parsed from memory with
    ``pdfplumber``. For each page, the page text is appended first, followed
    by one tab-separated line per table row.

    Args:
        pdf_path: URL of the PDF document to download.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The extracted content as a single string. If an error occurs, it is
        reported with ``st.error`` and whatever was extracted before the
        failure (possibly an empty string) is returned.
    """
    # NOTE(review): the URL appears to come straight from a user-facing text
    # input; consider restricting allowed schemes/hosts before fetching.
    chunks = []
    try:
        response = requests.get(pdf_path, timeout=timeout)
        # Fail with a clear HTTP error instead of handing an error page to
        # the PDF parser.
        response.raise_for_status()

        # Parse from memory regardless of the URL suffix: no temporary file
        # is needed, so nothing can be left behind on disk or clash between
        # concurrent sessions.
        with pdfplumber.open(io.BytesIO(response.content)) as pdf:
            for page in pdf.pages:
                # extract_text() may yield None for a page without text.
                chunks.append((page.extract_text() or "") + "\n")
                for table in page.extract_tables():
                    for row in table:
                        chunks.append("\t".join(str(cell) for cell in row) + "\n")
    except Exception as e:
        st.error(f"An error occurred: {str(e)}")
    return "".join(chunks)
# --- Page layout -----------------------------------------------------------
# Empty padded block used as vertical spacing above the form.
vert_space = '<div style="padding: 3rem 1rem;"></div>'
st.markdown(vert_space, unsafe_allow_html=True)
st.write("Extract full text from PDF URL")

# --- Inputs ----------------------------------------------------------------
pdfURL = st.text_input(label="PDF URL")
questionText = st.text_input(
    label="prompt",
    help="""You can make your prompt very specific to get the desired output. For example, if you need an invoice number, you can format your prompt like this:
Example Prompt:
*Extract items with the following details:
Invoice Number: xxxx
Date: [Insert Date format]
Customer Name:
Total Amount:
By providing clear and detailed information in your prompt, you'll receive an accurate and tailored response.""",
)
button = st.button(label='Extract')

# Placeholder that receives the model's answer once it is available.
extractedText = st.empty()

# --- Action ----------------------------------------------------------------
if button:
    # Validate the inputs first so no download or model call is made with
    # nothing to work on.
    if not pdfURL.strip():
        st.warning("Please enter a PDF URL.")
    elif not questionText.strip():
        st.warning("Please enter a prompt.")
    else:
        try:
            text = fextractURL(pdfURL)
            if not text.strip():
                # Extraction failed or the PDF has no extractable text;
                # skip the model call rather than querying it on empty input.
                st.warning("No text could be extracted from the PDF.")
            else:
                AItext = AImodel(text, questionText)
                extractedText.text(AItext)
        except Exception as e:
            st.error(f"An error occurred: {str(e)}")