ShayanRl committed on
Commit
2215918
·
verified ·
1 Parent(s): ee4cf98

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +330 -66
app.py CHANGED
@@ -1,82 +1,346 @@
1
  import streamlit as st
2
- import requests
3
  import pdfplumber
 
 
 
 
 
 
4
  import os
5
- import fitz # PyMuPDF
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
def download_pdf(pdf_path):
    """Download a PDF from a URL, or pass through an existing local path.

    Args:
        pdf_path: Either a filesystem path to an existing PDF or an
            HTTP(S) URL to fetch.

    Returns:
        The local file path of the PDF ('downloaded_document.pdf' for
        downloads), or None if the download failed (an error is shown
        in the Streamlit UI instead of raising).
    """
    # Local files need no network round-trip.
    if os.path.isfile(pdf_path):
        return pdf_path

    try:
        # timeout prevents the Streamlit worker from hanging forever on
        # an unresponsive server (the original call had no timeout).
        response = requests.get(pdf_path, timeout=30)
        response.raise_for_status()

        pdf_filename = 'downloaded_document.pdf'
        with open(pdf_filename, 'wb') as pdf_file:
            pdf_file.write(response.content)
        return pdf_filename
    except Exception as e:
        # Surface the failure in the UI rather than crashing the app.
        st.error(f"Error downloading PDF: {e}")
        return None
23
-
24
def extract_content(pdf_path):
    """Extract plain text (pdfplumber) and HTML (PyMuPDF) from a PDF.

    Args:
        pdf_path: Path to a local PDF file.

    Returns:
        A (text, html) tuple. Either element may be empty if the
        corresponding extractor failed; failures are reported via
        st.error instead of raising.
    """
    text_chunks = []
    html_chunks = []

    # Pass 1: raw page text plus tab-separated table rows via pdfplumber.
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                text_chunks.append((page.extract_text() or "") + "\n")
                for table in page.extract_tables():
                    for row in table:
                        # Table cells may be None; render them as empty strings.
                        cells = ["" if cell is None else str(cell) for cell in row]
                        text_chunks.append("\t".join(cells) + "\n")
    except Exception as e:
        st.error(f"Error extracting text with pdfplumber: {e}")

    # Pass 2: per-page HTML rendering via PyMuPDF (fitz).
    try:
        doc = fitz.open(pdf_path)
        for page in doc:
            html_chunks.append(page.get_text("html"))
        doc.close()
    except Exception as e:
        st.error(f"Error extracting HTML with PyMuPDF: {e}")

    return "".join(text_chunks), "".join(html_chunks)
53
-
54
# Spacer pushes the page content below Streamlit's top chrome.
vert_space = '<div style="padding: 3rem 1rem;"></div>'
st.markdown(vert_space, unsafe_allow_html=True)

st.title("PDF Content Scraper")
st.write("Extract full text and HTML from PDF URL")

pdfURL = st.text_input(label="PDF URL", value="", placeholder="Enter PDF URL here")
button = st.button(label='Extract')

# Act only once the user has both supplied a URL and clicked the button.
if button and pdfURL:
    with st.spinner("Downloading and extracting..."):
        local_pdf = download_pdf(pdfURL)

        if local_pdf:
            text, html = extract_content(local_pdf)

            # Remove the temporary download so repeated runs don't
            # accumulate files (a user-supplied local path is kept).
            was_downloaded = local_pdf == 'downloaded_document.pdf'
            if was_downloaded and os.path.exists(local_pdf):
                os.remove(local_pdf)

            st.subheader("Raw Text Content")
            st.text_area("Extracted Text", text, height=300)

            st.subheader("HTML Content")
            st.write("Rendered HTML Preview:")
            import streamlit.components.v1 as components
            components.html(html, height=600, scrolling=True)

            with st.expander("View HTML Source"):
                st.code(html, language='html')
 
1
  import streamlit as st
 
2
  import pdfplumber
3
+ import pandas as pd
4
+ from bs4 import BeautifulSoup
5
+ import re
6
+ import json
7
+ import requests
8
+ import tempfile
9
  import os
10
+ from typing import List, Dict, Any, Union
11
+ from urllib.parse import urlparse, unquote
12
+ import html
13
+
14
+ try:
15
+ import fitz # PyMuPDF
16
+ PYMUPDF_AVAILABLE = True
17
+ except ImportError:
18
+ PYMUPDF_AVAILABLE = False
19
+ print("PyMuPDF not available, using pdfplumber only")
20
+
21
+ from lxml import html as lxml_html, etree
22
+ from dataclasses import dataclass, asdict
23
+
24
+
25
@dataclass
class PDFElement:
    """A single structural element extracted from a PDF page.

    Attributes:
        type: Element kind: 'heading', 'paragraph', 'list', or 'table'.
        content: Text of the element, or a list of rows for tables.
        page: 1-based page number the element came from.
        bbox: Optional bounding box on the page; None when not captured.
        style: Optional style metadata; None when not captured.
        level: Optional heading level (1 or 2); None for non-headings.
    """
    type: str
    content: Any
    page: int
    # Defaults are None, so the annotations must be optional (the
    # originals claimed bare tuple/Dict/int, which type checkers reject).
    bbox: Union[tuple, None] = None
    style: Union[Dict, None] = None
    level: Union[int, None] = None
34
 
35
+
36
class PDFProcessor:
    """Download a PDF from a URL, extract structured elements, and
    render them as a standalone HTML document with stable element IDs.
    """

    def __init__(self):
        # Extracted PDFElement records, in document order.
        self.elements = []
        # Full HTML document produced by _convert_to_html().
        self.html_content = ""
        # Monotonic counter used to mint unique element IDs.
        self.element_counter = 0

    def process_pdf(self, pdf_url: str) -> Dict:
        """Fetch the PDF at *pdf_url*, extract content, and build HTML.

        Populates self.elements and self.html_content as side effects.

        Returns:
            A summary dict with 'total_elements', 'pages', 'headings',
            'tables', and 'paragraphs' counts.

        The temporary download is removed even when extraction raises.
        """
        temp_file = None

        try:
            temp_file = self._download_pdf(pdf_url)

            # Extract content
            self.elements = self._extract_content(temp_file)
            self.html_content = self._convert_to_html()

            # Get summary
            summary = {
                'total_elements': len(self.elements),
                'pages': max([e.page for e in self.elements]) if self.elements else 0,
                'headings': len([e for e in self.elements if e.type == 'heading']),
                'tables': len([e for e in self.elements if e.type == 'table']),
                'paragraphs': len([e for e in self.elements if e.type == 'paragraph'])
            }

            return summary

        finally:
            if temp_file and os.path.exists(temp_file):
                try:
                    os.unlink(temp_file)
                except OSError:
                    # Narrowed from a bare `except:` — only swallow
                    # filesystem errors during best-effort cleanup.
                    pass

    def _download_pdf(self, url: str) -> str:
        """Download *url* to a temporary file and return its path.

        Raises:
            requests.HTTPError: on a non-2xx response (raise_for_status).
        """
        # Some servers reject requests without a browser-like UA string.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        # delete=False so the file survives close(); process_pdf()
        # unlinks it in its finally block.
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
        temp_file.write(response.content)
        temp_file.close()

        return temp_file.name

    def _get_element_id(self, element_type: str) -> str:
        """Return a unique id like 'heading-3'; counter is shared
        across all element types within one processor instance."""
        self.element_counter += 1
        return f"{element_type}-{self.element_counter}"

    def _extract_content(self, pdf_path: str) -> List["PDFElement"]:
        """Extract structured content from the PDF at *pdf_path*.

        Prefers PyMuPDF (font-size based heading detection) and falls
        back to pdfplumber-only heuristics when PyMuPDF is unavailable
        or fails. Tables always come from pdfplumber.
        """
        elements = []

        if PYMUPDF_AVAILABLE:
            try:
                # Try PyMuPDF first for better structure detection
                doc = fitz.open(pdf_path)

                for page_num, page in enumerate(doc, 1):
                    blocks = page.get_text("dict")

                    for block in blocks["blocks"]:
                        if block["type"] == 0:  # Text block
                            for line in block["lines"]:
                                for span in line["spans"]:
                                    text = span["text"].strip()
                                    if not text:
                                        continue

                                    font_size = span["size"]

                                    # Simple classification: large fonts are
                                    # headings, bullet/number prefixes are lists.
                                    if font_size > 14:
                                        element_type = "heading"
                                        level = 1 if font_size > 18 else 2
                                    elif re.match(r'^[\d\-\β€’\*]+\.?\s+', text):
                                        element_type = "list"
                                        level = None
                                    else:
                                        element_type = "paragraph"
                                        level = None

                                    elements.append(PDFElement(
                                        type=element_type,
                                        content=text,
                                        page=page_num,
                                        level=level
                                    ))

                doc.close()

                # Also get tables with pdfplumber
                with pdfplumber.open(pdf_path) as pdf:
                    for page_num, page in enumerate(pdf.pages, 1):
                        tables = page.extract_tables()
                        for table in tables:
                            if table:
                                elements.append(PDFElement(
                                    type="table",
                                    content=table,
                                    page=page_num
                                ))

                return elements

            except Exception as e:
                print(f"PyMuPDF failed: {e}, falling back to pdfplumber")

        # Fallback to pdfplumber only
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, 1):
                text = page.extract_text() or ""
                lines = text.split('\n')

                for line in lines:
                    line = line.strip()
                    if not line:
                        continue

                    # Heuristic: short all-caps lines are headings.
                    if line.isupper() and len(line) < 100:
                        element_type = "heading"
                        level = 1
                    elif re.match(r'^[\d\-\β€’\*]+\.?\s+', line):
                        element_type = "list"
                        level = None
                    else:
                        element_type = "paragraph"
                        level = None

                    elements.append(PDFElement(
                        type=element_type,
                        content=line,
                        page=page_num,
                        level=level
                    ))

                # Extract tables
                tables = page.extract_tables()
                for table in tables:
                    if table:
                        elements.append(PDFElement(
                            type="table",
                            content=table,
                            page=page_num
                        ))

        return elements

    def _convert_to_html(self) -> str:
        """Render self.elements as a full HTML document.

        Every emitted element carries a unique id and a data-page
        attribute; a page-marker div is inserted at each page change.
        """
        html_parts = ['''
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <style>
        body {
            font-family: Arial, sans-serif;
            line-height: 1.6;
            padding: 20px;
            max-width: 900px;
            margin: 0 auto;
        }
        h1, h2, h3 { color: #333; margin-top: 20px; }
        table {
            border-collapse: collapse;
            width: 100%;
            margin: 20px 0;
        }
        th, td {
            border: 1px solid #ddd;
            padding: 8px;
            text-align: left;
        }
        th { background-color: #f2f2f2; }
        p { margin: 10px 0; }
        li { margin: 5px 0; }
        .page-marker {
            color: #888;
            font-size: 0.9em;
            margin-top: 30px;
            padding-top: 10px;
            border-top: 2px solid #eee;
        }
    </style>
</head>
<body>
''']

        current_page = 0

        for elem in self.elements:
            # Add page marker whenever the page number changes.
            if elem.page != current_page:
                current_page = elem.page
                html_parts.append(f'<div class="page-marker" id="page-{current_page}">Page {current_page}</div>')

            if elem.type == "heading":
                level = elem.level or 2
                elem_id = self._get_element_id('heading')
                content = html.escape(elem.content)
                html_parts.append(f'<h{level} id="{elem_id}" data-page="{elem.page}">{content}</h{level}>')

            elif elem.type == "paragraph":
                elem_id = self._get_element_id('paragraph')
                content = html.escape(elem.content)
                html_parts.append(f'<p id="{elem_id}" data-page="{elem.page}">{content}</p>')

            elif elem.type == "list":
                elem_id = self._get_element_id('list-item')
                content = html.escape(elem.content)
                html_parts.append(f'<li id="{elem_id}" data-page="{elem.page}">{content}</li>')

            elif elem.type == "table":
                elem_id = self._get_element_id('table')
                html_parts.append(f'<table id="{elem_id}" data-page="{elem.page}">')
                for i, row in enumerate(elem.content):
                    row_id = self._get_element_id('table-row')
                    html_parts.append(f'<tr id="{row_id}">')
                    tag = 'th' if i == 0 else 'td'  # first row is the header
                    for j, cell in enumerate(row):
                        cell_id = self._get_element_id('table-cell')
                        # `is not None` (not truthiness) so falsy-but-real
                        # cells like 0 or 0.0 still render their value.
                        cell_content = html.escape(str(cell)) if cell is not None else ""
                        html_parts.append(f'<{tag} id="{cell_id}">{cell_content}</{tag}>')
                    html_parts.append('</tr>')
                html_parts.append('</table>')

        html_parts.append('</body></html>')
        return '\n'.join(html_parts)
273
 
 
 
274
 
275
+ # Streamlit App
276
+ def main():
277
+ st.set_page_config(
278
+ page_title="PDF to HTML Converter",
279
+ page_icon="πŸ“„",
280
+ layout="wide"
281
+ )
282
+
283
+ st.title("πŸ“„ PDF to HTML Converter")
284
+ st.markdown("Extract PDF content and view as structured HTML")
285
+
286
+ # Initialize session state
287
+ if 'processor' not in st.session_state:
288
+ st.session_state.processor = None
289
+ if 'html_content' not in st.session_state:
290
+ st.session_state.html_content = None
291
+
292
+ # Input section
293
+ pdf_url = st.text_input(
294
+ "Enter PDF URL",
295
+ placeholder="https://example.com/document.pdf",
296
+ help="Enter the URL of the PDF you want to process"
297
+ )
298
+
299
+ if st.button("Process PDF", type="primary"):
300
+ if not pdf_url:
301
+ st.error("Please enter a PDF URL")
302
+ else:
303
+ with st.spinner("Processing PDF..."):
304
+ try:
305
+ processor = PDFProcessor()
306
+ summary = processor.process_pdf(pdf_url)
307
+
308
+ st.session_state.processor = processor
309
+ st.session_state.html_content = processor.html_content
310
+
311
+ st.success(f"βœ… PDF processed successfully! ({summary['total_elements']} elements extracted)")
312
+
313
+ except Exception as e:
314
+ st.error(f"❌ Error processing PDF: {str(e)}")
315
+
316
+ # Display HTML in iframe
317
+ if st.session_state.html_content:
318
+ st.markdown("---")
319
+ st.subheader("πŸ“‹ Extracted HTML Content")
320
 
321
+ # Create tabs for different views
322
+ tab1, tab2 = st.tabs(["HTML Preview", "HTML Source"])
323
+
324
+ with tab1:
325
+ # Display in iframe
326
+ st.components.v1.html(
327
+ st.session_state.html_content,
328
+ height=800,
329
+ scrolling=True
330
+ )
331
+
332
+ with tab2:
333
+ # Show source code
334
+ st.code(st.session_state.html_content, language='html')
335
 
336
+ # Download button
337
+ st.download_button(
338
+ label="πŸ“₯ Download HTML",
339
+ data=st.session_state.html_content,
340
+ file_name="extracted_content.html",
341
+ mime="text/html"
342
+ )
343
 
344
+
345
+ if __name__ == "__main__":
346
+ main()