File size: 2,061 Bytes
cadae78
2215918
5405dd5
2215918
5405dd5
 
2215918
cadae78
5405dd5
 
2215918
5405dd5
 
 
 
 
 
 
 
159e468
5405dd5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2215918
cadae78
 
159e468
5405dd5
 
 
 
 
 
2215918
5405dd5
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import streamlit as st

import io

import requests
import pdfplumber


def fextractURL(pdf_path, timeout=30):
    """Download the PDF at ``pdf_path`` and return its text and table content.

    The document is fetched over HTTP and parsed entirely in memory, so the
    same code path serves URLs with or without a ``.pdf`` suffix and no
    temporary file is left on disk.

    Args:
        pdf_path: URL of the PDF document to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A single string. For every page, in order: the page text followed by
        a newline, then one line per table row with the cells joined by tabs.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(pdf_path, timeout=timeout)
    # Fail early on an error page instead of feeding HTML to the PDF parser.
    response.raise_for_status()

    parts = []
    with pdfplumber.open(io.BytesIO(response.content)) as pdf:
        for page in pdf.pages:
            # Guard against extract_text() returning None for a page with no
            # text, which would make the concatenation raise a TypeError.
            parts.append((page.extract_text() or "") + "\n")
            for table in page.extract_tables():
                for row in table:
                    parts.append("\t".join(str(cell) for cell in row) + "\n")

    return "".join(parts)


# --- Streamlit page: ask for a PDF URL and show the extracted text ---------

# Empty padded block used purely as vertical spacing above the form.
vert_space = '<div style="padding: 3rem 1rem;"></div>'
st.markdown(vert_space, unsafe_allow_html=True)
st.write("Extract full text from PDF url")

# Only non-default arguments are passed; the rest keep Streamlit's defaults.
pdfURL = st.text_input(label="origin URL", value="")
button = st.button(label='Extract')

# Placeholder that receives the extraction result once it is available.
extractedText = st.empty()

# st.button returns True only on the rerun triggered by the click, so the
# download runs once per click and only when a URL has been entered.
if button and pdfURL:
    extractedText.text(fextractURL(pdfURL))