import io

import pdfplumber
import requests
import streamlit as st

# Seconds passed to requests as the timeout for the PDF download, so a
# stalled server cannot hang the app indefinitely.
_DOWNLOAD_TIMEOUT = 30


def _page_to_text(page):
    """Return one pdfplumber page as text followed by its tables.

    The page text comes first, terminated by a newline; every table row
    follows on its own line with cells separated by tabs.
    """
    # Guard against a None result from extract_text() (reported for pages
    # without extractable text on some pdfplumber versions), which would
    # otherwise raise TypeError on concatenation.
    parts = [(page.extract_text() or "") + "\n"]
    for table in page.extract_tables():
        for row in table:
            parts.append("\t".join(str(cell) for cell in row) + "\n")
    return "".join(parts)


def fextractURL(pdf_path):
    """Download the PDF at ``pdf_path`` and return its text and tables.

    The document is parsed from memory regardless of whether the URL ends
    in ``.pdf``; nothing is written to disk, so no temporary file is left
    behind and concurrent sessions cannot overwrite each other's download.

    Args:
        pdf_path: URL of the PDF document to fetch.

    Returns:
        A single string with, for each page in order, the page text and
        then the rows of every table found on that page (tab-separated
        cells, one row per line).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(pdf_path, timeout=_DOWNLOAD_TIMEOUT)
    # Fail early on 4xx/5xx instead of feeding an error page to the parser.
    response.raise_for_status()

    with pdfplumber.open(io.BytesIO(response.content)) as pdf:
        return "".join(_page_to_text(page) for page in pdf.pages)


# NOTE(review): the literal as found contains only a line break. Since it is
# rendered with unsafe_allow_html=True it presumably held HTML spacer markup
# that was lost -- confirm and restore if so.
vert_space = '\n'
st.markdown(vert_space, unsafe_allow_html=True)

st.write("Extract full text from PDF url")

pdfURL = st.text_input(
    label="origin URL",
    value="",
    max_chars=None,
    key=None,
    type="default",
    help=None,
    autocomplete=None,
    on_change=None,
    args=None,
    kwargs=None,
    placeholder=None,
    disabled=False,
    label_visibility="visible",
)

button = st.button(
    label='Extract',
    key=None,
    help=None,
    on_click=None,
    args=None,
    kwargs=None,
    type="secondary",
    disabled=False,
    use_container_width=False,
)

# Placeholder element; presumably filled with the extraction result later.
extractedText = st.empty()