ShayanRl committed on
Commit
cadae78
·
verified ·
1 Parent(s): 1c86080

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +35 -0
  2. main.py +26 -0
app.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ import io
4
+
5
+ import requests
6
+ import pdfplumber
7
+
8
def fextractURL(pdf_path):
    """Download the PDF at *pdf_path* and return its text and table content.

    Parameters
    ----------
    pdf_path : str
        URL of the PDF document to fetch.

    Returns
    -------
    str
        Page text followed by tab-separated table rows, one row per line.

    Raises
    ------
    requests.HTTPError
        If the server responds with a non-2xx status.
    requests.Timeout
        If the download does not complete within 30 seconds.
    """
    # Time-bound the request so an unresponsive server cannot hang the app,
    # and fail loudly instead of feeding an HTML error page to pdfplumber.
    r = requests.get(pdf_path, timeout=30)
    r.raise_for_status()
    f = io.BytesIO(r.content)
    parts = []
    with pdfplumber.open(f) as pdf:
        for page in pdf.pages:
            # extract_text() returns None for image-only pages; guard against
            # the TypeError that `None + "\n"` would raise.
            text = page.extract_text()
            if text:
                parts.append(text + "\n")
            for table in page.extract_tables():
                for row in table:
                    parts.append("\t".join(str(cell) for cell in row) + "\n")
    # Join once instead of repeated += concatenation (avoids quadratic cost
    # on large documents).
    return "".join(parts)
20
+
21
+
22
+
23
+
24
# --- Streamlit UI: URL input, an Extract button, and an output placeholder. ---
# Typo fixed in the caption ("Extarct" -> "Extract"); widget keyword arguments
# that merely restated library defaults are omitted — behavior is unchanged.
st.write("Extract full text from PDF url")

pdfURL = st.text_input(label="origin URL", value="")
button = st.button(label='Extract')
extractedText = st.empty()

if button:
    try:
        text = fextractURL(pdfURL)
        extractedText.text(text)
    except Exception as e:
        # Surface download/parse failures to the user instead of crashing
        # the app; the broad catch is intentional at this UI boundary.
        st.error(f"An error occurred: {str(e)}")
main.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+
3
+ import requests
4
+ import pdfplumber
5
+
6
def fextractURL(pdf_path):
    """Download the PDF at *pdf_path* and return its text and table content.

    Parameters
    ----------
    pdf_path : str
        URL of the PDF document to fetch.

    Returns
    -------
    str
        Page text followed by tab-separated table rows, one row per line.

    Raises
    ------
    requests.HTTPError
        If the server responds with a non-2xx status.
    requests.Timeout
        If the download does not complete within 30 seconds.
    """
    # Time-bound the request so an unresponsive server cannot hang forever,
    # and fail loudly instead of feeding an HTML error page to pdfplumber.
    r = requests.get(pdf_path, timeout=30)
    r.raise_for_status()
    f = io.BytesIO(r.content)
    parts = []
    with pdfplumber.open(f) as pdf:
        for page in pdf.pages:
            # extract_text() returns None for image-only pages; guard against
            # the TypeError that `None + "\n"` would raise.
            text = page.extract_text()
            if text:
                parts.append(text + "\n")
            for table in page.extract_tables():
                for row in table:
                    parts.append("\t".join(str(cell) for cell in row) + "\n")
    # Join once instead of repeated += concatenation (avoids quadratic cost
    # on large documents).
    return "".join(parts)
18
+
19
+
20
+
21
+
22
+
23
+
24
+
25
+
26
+