ShayanRl committed on
Commit
159e468
·
verified ·
1 Parent(s): 36a28df

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -47
app.py CHANGED
@@ -1,62 +1,84 @@
1
  import streamlit as st
2
- import io
3
  import requests
4
  import pdfplumber
5
  import os
 
6
 
7
def fextractURL(pdf_path):
    """Download the PDF at ``pdf_path`` and return its text and table rows.

    The document is fetched once and parsed from memory, so no temporary
    file is written (and none can be left behind when parsing fails).

    Args:
        pdf_path: URL of the PDF to fetch.

    Returns:
        A string containing, for every page, the page text followed by one
        tab-separated line per table row. Whatever was extracted before a
        failure is still returned; the failure itself is reported through
        ``st.error``.
    """
    import io  # local import so the block does not rely on a file-level one

    pieces = []
    try:
        response = requests.get(pdf_path, timeout=30)
        # Fail early on 4xx/5xx instead of handing an error page to the parser.
        response.raise_for_status()

        with pdfplumber.open(io.BytesIO(response.content)) as pdf:
            for page in pdf.pages:
                # extract_text() yields None for pages without a text layer.
                pieces.append(page.extract_text() or "")
                for table in page.extract_tables():
                    for row in table:
                        # Empty table cells come back as None; emit them as
                        # empty fields rather than the literal "None".
                        pieces.append(
                            "\t".join(
                                "" if cell is None else str(cell)
                                for cell in row
                            )
                        )
    except Exception as e:
        st.error(f"An error occurred: {str(e)}")

    return "".join(piece + "\n" for piece in pieces)
 
 
 
 
 
 
 
47
 
 
48
 
49
# Push the page content down a little with an empty, padded block.
vert_space = '<div style="padding: 3rem 1rem;"></div>'
st.markdown(vert_space, unsafe_allow_html=True)
st.write("Extract full text from PDF URL")

# Input widgets: the PDF address and the button that triggers extraction.
pdfURL = st.text_input(
    label="PDF URL",
    value="",
    max_chars=None,
    key=None,
    type="default",
    help=None,
    autocomplete=None,
    on_change=None,
    args=None,
    kwargs=None,
    placeholder=None,
    disabled=False,
    label_visibility="visible",
)
button = st.button(
    label='Extract',
    key=None,
    help=None,
    on_click=None,
    args=None,
    kwargs=None,
    type="secondary",
    disabled=False,
    use_container_width=False,
)

# Placeholder that is filled with the extracted text once it is available.
extractedText = st.empty()

if button:
    try:
        text = fextractURL(pdfURL)
        extractedText.text(text)
    except Exception as err:
        st.error(f"An error occurred: {str(err)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
 
2
  import requests
3
  import pdfplumber
4
  import os
5
+ import fitz # PyMuPDF
6
 
7
def download_pdf(pdf_path, timeout=30):
    """Return a local path for ``pdf_path``, downloading it when necessary.

    Args:
        pdf_path: Path of an existing local file, or a URL to fetch.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout a stalled server would block the app indefinitely.

    Returns:
        ``pdf_path`` unchanged when it names an existing local file,
        ``'downloaded_document.pdf'`` after a successful download, or
        ``None`` when the download or the write failed (the error is
        reported through ``st.error``).
    """
    # NOTE(review): pdf_path is fed from a text box, so this branch lets a
    # user point the app at any file readable by the server process --
    # confirm that local paths are meant to be accepted.
    if os.path.isfile(pdf_path):
        return pdf_path

    pdf_filename = 'downloaded_document.pdf'
    try:
        response = requests.get(pdf_path, timeout=timeout)
        response.raise_for_status()

        with open(pdf_filename, 'wb') as pdf_file:
            pdf_file.write(response.content)
    except Exception as e:
        # Deliberately broad: this is the UI boundary, and any failure
        # (bad URL, HTTP error, timeout, disk error) is shown to the user.
        st.error(f"Error downloading PDF: {e}")
        return None

    return pdf_filename
23
 
24
def extract_content(pdf_path):
    """Extract raw text (pdfplumber) and HTML (PyMuPDF) from a PDF file.

    The two extractions are independent: a failure in one is reported
    through ``st.error`` and does not prevent the other from running.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A ``(text_data, html_data)`` tuple. ``text_data`` holds, for every
        page, the page text followed by one tab-separated line per table
        row; ``html_data`` is the concatenated per-page HTML. Either part
        contains whatever was extracted before a failure (possibly ``""``).
    """
    # 1. Raw text and tables via pdfplumber.
    text_parts = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # extract_text() yields None for pages without a text layer.
                text_parts.append(page.extract_text() or "")
                for table in page.extract_tables():
                    for row in table:
                        # Empty cells come back as None; emit empty fields.
                        text_parts.append(
                            "\t".join(
                                str(cell) if cell is not None else ""
                                for cell in row
                            )
                        )
    except Exception as e:
        st.error(f"Error extracting text with pdfplumber: {e}")

    # 2. HTML via PyMuPDF. The context manager closes the document even
    #    when get_text() raises part-way through.
    html_parts = []
    try:
        with fitz.open(pdf_path) as doc:
            for page in doc:
                html_parts.append(page.get_text("html"))
    except Exception as e:
        st.error(f"Error extracting HTML with PyMuPDF: {e}")

    text_data = "".join(part + "\n" for part in text_parts)
    html_data = "".join(html_parts)
    return text_data, html_data
53
 
54
# Push the page content down a little with an empty, padded block.
vert_space = '<div style="padding: 3rem 1rem;"></div>'
st.markdown(vert_space, unsafe_allow_html=True)
st.title("PDF Content Scraper")
st.write("Extract full text and HTML from PDF URL")

pdfURL = st.text_input(label="PDF URL", value="", placeholder="Enter PDF URL here")
button = st.button(label='Extract')

if button and pdfURL:
    with st.spinner("Downloading and extracting..."):
        # Remember whether the input already named a local file, so that
        # cleanup below only ever deletes a file this run downloaded --
        # never a user's own file that happens to share the download name.
        is_local_input = os.path.isfile(pdfURL)
        local_pdf = download_pdf(pdfURL)

        if local_pdf:
            try:
                text, html = extract_content(local_pdf)
            finally:
                # Remove the downloaded copy even if extraction raised.
                if not is_local_input and os.path.exists(local_pdf):
                    os.remove(local_pdf)

    # download_pdf() has already reported the error when local_pdf is None.
    if local_pdf:
        st.subheader("Raw Text Content")
        st.text_area("Extracted Text", text, height=300)

        st.subheader("HTML Content")
        st.download_button(
            label="Download HTML",
            data=html,
            file_name="extracted_content.html",
            mime="text/html"
        )
        with st.expander("View HTML Source"):
            st.code(html, language='html')
elif button:
    # Clicking the button with an empty box used to do nothing silently.
    st.warning("Please enter a PDF URL.")