ShayanRl committed on
Commit
ffd8879
·
verified ·
1 Parent(s): 1f4c254

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -37
app.py CHANGED
@@ -1,54 +1,56 @@
1
  import streamlit as st
2
-
3
  import io
4
-
5
  import requests
6
  import pdfplumber
 
7
 
8
  def fextractURL(pdf_path):
9
  extracted_data = ""
10
 
11
- if pdf_path.endswith('.pdf'):
12
- # If the URL ends with .pdf, use pdfplumber directly
13
- r = requests.get(pdf_path)
14
- f = io.BytesIO(r.content)
15
- with pdfplumber.open(f) as pdf:
16
- for page in pdf.pages:
17
- extracted_data += page.extract_text() + "\n" # Extract text
18
- tables = page.extract_tables() # Extract tables
19
- for table in tables:
20
- for row in table:
21
- extracted_data += "\t".join(str(cell) for cell in row) + "\n"
22
- else:
23
- # If the URL does not end with .pdf, download the PDF first
24
- response = requests.get(pdf_path)
25
- pdf_content = response.content
26
-
27
- # Save the PDF locally
28
- pdf_filename = 'downloaded_document.pdf'
29
- with open(pdf_filename, 'wb') as pdf_file:
30
- pdf_file.write(pdf_content)
31
-
32
- # Extract content using pdfplumber
33
- with pdfplumber.open(pdf_filename) as pdf:
34
- for page in pdf.pages:
35
- extracted_data += page.extract_text() + "\n" # Extract text
36
- tables = page.extract_tables() # Extract tables
37
- for table in tables:
38
- for row in table:
39
- extracted_data += "\t".join(str(cell) for cell in row) + "\n"
40
-
41
- # Delete the PDF file
42
-
43
-
 
 
 
44
  return extracted_data
45
 
46
 
47
  vert_space = '<div style="padding: 3rem 1rem;"></div>'
48
  st.markdown(vert_space, unsafe_allow_html=True)
49
- st.write("Extarct full text from PDF url")
50
 
51
- pdfURL = st.text_input(label="origin URL", value="", max_chars=None, key=None, type="default", help=None, autocomplete=None, on_change=None, args=None, kwargs=None, placeholder=None, disabled=False, label_visibility="visible")
52
  button = st.button(label='Extract', key=None, help=None, on_click=None, args=None, kwargs=None, type="secondary", disabled=False, use_container_width=False)
53
  extractedText = st.empty()
54
 
 
1
  import streamlit as st
 
2
  import io
 
3
  import requests
4
  import pdfplumber
5
+ import os
6
 
7
  def fextractURL(pdf_path):
8
  extracted_data = ""
9
 
10
+ try:
11
+ if pdf_path.endswith('.pdf'):
12
+ # If the URL ends with .pdf, use pdfplumber directly
13
+ r = requests.get(pdf_path)
14
+ f = io.BytesIO(r.content)
15
+ with pdfplumber.open(f) as pdf:
16
+ for page in pdf.pages:
17
+ extracted_data += page.extract_text() + "\n" # Extract text
18
+ tables = page.extract_tables() # Extract tables
19
+ for table in tables:
20
+ for row in table:
21
+ extracted_data += "\t".join(str(cell) for cell in row) + "\n"
22
+ else:
23
+ # If the URL does not end with .pdf, download the PDF first
24
+ response = requests.get(pdf_path)
25
+ pdf_content = response.content
26
+
27
+ # Save the PDF locally
28
+ pdf_filename = 'downloaded_document.pdf'
29
+ with open(pdf_filename, 'wb') as pdf_file:
30
+ pdf_file.write(pdf_content)
31
+
32
+ # Extract content using pdfplumber
33
+ with pdfplumber.open(pdf_filename) as pdf:
34
+ for page in pdf.pages:
35
+ extracted_data += page.extract_text() + "\n" # Extract text
36
+ tables = page.extract_tables() # Extract tables
37
+ for table in tables:
38
+ for row in table:
39
+ extracted_data += "\t".join(str(cell) for cell in row) + "\n"
40
+
41
+ # Delete the PDF file
42
+ os.remove(pdf_filename)
43
+ except Exception as e:
44
+ st.error(f"An error occurred: {str(e)}")
45
+
46
  return extracted_data
47
 
48
 
49
  vert_space = '<div style="padding: 3rem 1rem;"></div>'
50
  st.markdown(vert_space, unsafe_allow_html=True)
51
+ st.write("Extract full text from PDF URL")
52
 
53
+ pdfURL = st.text_input(label="PDF URL", value="", max_chars=None, key=None, type="default", help=None, autocomplete=None, on_change=None, args=None, kwargs=None, placeholder=None, disabled=False, label_visibility="visible")
54
  button = st.button(label='Extract', key=None, help=None, on_click=None, args=None, kwargs=None, type="secondary", disabled=False, use_container_width=False)
55
  extractedText = st.empty()
56