Upload 2 files
Browse files
app.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
|
| 3 |
+
import io
|
| 4 |
+
|
| 5 |
+
import requests
|
| 6 |
+
import pdfplumber
|
| 7 |
+
|
| 8 |
+
def fextractURL(pdf_path, timeout=30):
    """Download a PDF from a URL and return its text and table contents.

    Args:
        pdf_path: URL of the PDF document; passed straight to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled server.

    Returns:
        One string. For every page, in order: the page text followed by a
        newline, then one tab-separated, newline-terminated line per row of
        every table found on that page.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx HTTP status.
    """
    response = requests.get(pdf_path, timeout=timeout)
    # Fail early on an HTTP error instead of feeding an error page to the
    # PDF parser, which would only produce a confusing parse failure.
    response.raise_for_status()

    # Collect fragments and join once rather than growing a string with +=.
    parts = []
    with pdfplumber.open(io.BytesIO(response.content)) as pdf:
        for page in pdf.pages:
            # Guard against a None result so a page without extractable text
            # does not raise TypeError on the concatenation.
            parts.append((page.extract_text() or "") + "\n")
            for table in page.extract_tables():
                for row in table:
                    # Render missing cells as empty fields, not the text "None".
                    parts.append(
                        "\t".join("" if cell is None else str(cell) for cell in row)
                        + "\n"
                    )
    return "".join(parts)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# Streamlit page: ask for a PDF URL and show the text extracted from it.
st.write("Extract full text from PDF url")

# Only non-default widget arguments are passed; the rest use Streamlit's defaults.
pdfURL = st.text_input(label="origin URL")
button = st.button(label="Extract")
# Placeholder that is filled with the extracted text once it is available.
extractedText = st.empty()

if button:
    url = pdfURL.strip()
    if not url:
        # Avoid sending an empty URL to the downloader and surfacing a
        # cryptic low-level error to the user.
        st.warning("Please enter a PDF URL.")
    else:
        try:
            text = fextractURL(url)
            extractedText.text(text)
        except Exception as e:
            # Top-level UI boundary: report any download/parse failure to the
            # user instead of crashing the app.
            st.error(f"An error occurred: {e}")
|
main.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import io
|
| 2 |
+
|
| 3 |
+
import requests
|
| 4 |
+
import pdfplumber
|
| 5 |
+
|
| 6 |
+
def fextractURL(pdf_path, timeout=30):
    """Download a PDF from a URL and return its text and table contents.

    Args:
        pdf_path: URL of the PDF document; passed straight to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled server.

    Returns:
        One string. For every page, in order: the page text followed by a
        newline, then one tab-separated, newline-terminated line per row of
        every table found on that page.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx HTTP status.
    """
    response = requests.get(pdf_path, timeout=timeout)
    # Fail early on an HTTP error instead of feeding an error page to the
    # PDF parser, which would only produce a confusing parse failure.
    response.raise_for_status()

    # Collect fragments and join once rather than growing a string with +=.
    parts = []
    with pdfplumber.open(io.BytesIO(response.content)) as pdf:
        for page in pdf.pages:
            # Guard against a None result so a page without extractable text
            # does not raise TypeError on the concatenation.
            parts.append((page.extract_text() or "") + "\n")
            for table in page.extract_tables():
                for row in table:
                    # Render missing cells as empty fields, not the text "None".
                    parts.append(
                        "\t".join("" if cell is None else str(cell) for cell in row)
                        + "\n"
                    )
    return "".join(parts)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
|