ShayanRl committed on
Commit
5405dd5
·
verified ·
1 Parent(s): 6e1396f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -334
app.py CHANGED
@@ -1,346 +1,57 @@
1
  import streamlit as st
2
- import pdfplumber
3
- import pandas as pd
4
- from bs4 import BeautifulSoup
5
- import re
6
- import json
7
- import requests
8
- import tempfile
9
- import os
10
- from typing import List, Dict, Any, Union
11
- from urllib.parse import urlparse, unquote
12
- import html
13
-
14
- try:
15
- import fitz # PyMuPDF
16
- PYMUPDF_AVAILABLE = True
17
- except ImportError:
18
- PYMUPDF_AVAILABLE = False
19
- print("PyMuPDF not available, using pdfplumber only")
20
 
21
- from lxml import html as lxml_html, etree
22
- from dataclasses import dataclass, asdict
23
 
 
 
24
 
25
- @dataclass
26
- class PDFElement:
27
- """Represents an element extracted from PDF"""
28
- type: str
29
- content: Any
30
- page: int
31
- bbox: tuple = None
32
- style: Dict = None
33
- level: int = None
34
 
 
 
35
 
36
- class PDFProcessor:
37
- """Simplified PDF processor"""
38
-
39
- def __init__(self):
40
- self.elements = []
41
- self.html_content = ""
42
- self.element_counter = 0
43
-
44
- def process_pdf(self, pdf_url: str) -> Dict:
45
- """Process PDF from URL"""
46
- temp_file = None
47
-
48
- try:
49
- temp_file = self._download_pdf(pdf_url)
50
-
51
- # Extract content
52
- self.elements = self._extract_content(temp_file)
53
- self.html_content = self._convert_to_html()
54
-
55
- # Get summary
56
- summary = {
57
- 'total_elements': len(self.elements),
58
- 'pages': max([e.page for e in self.elements]) if self.elements else 0,
59
- 'headings': len([e for e in self.elements if e.type == 'heading']),
60
- 'tables': len([e for e in self.elements if e.type == 'table']),
61
- 'paragraphs': len([e for e in self.elements if e.type == 'paragraph'])
62
- }
63
-
64
- return summary
65
-
66
- finally:
67
- if temp_file and os.path.exists(temp_file):
68
- try:
69
- os.unlink(temp_file)
70
- except:
71
- pass
72
-
73
- def _download_pdf(self, url: str) -> str:
74
- """Download PDF from URL"""
75
- headers = {
76
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
77
- }
78
-
79
- response = requests.get(url, headers=headers, timeout=30)
80
- response.raise_for_status()
81
-
82
- temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
83
- temp_file.write(response.content)
84
- temp_file.close()
85
-
86
- return temp_file.name
87
-
88
- def _get_element_id(self, element_type: str) -> str:
89
- """Generate unique ID for element"""
90
- self.element_counter += 1
91
- return f"{element_type}-{self.element_counter}"
92
-
93
- def _extract_content(self, pdf_path: str) -> List[PDFElement]:
94
- """Extract structured content from PDF"""
95
- elements = []
96
-
97
- if PYMUPDF_AVAILABLE:
98
- try:
99
- # Try PyMuPDF first for better structure detection
100
- doc = fitz.open(pdf_path)
101
-
102
- for page_num, page in enumerate(doc, 1):
103
- blocks = page.get_text("dict")
104
-
105
- for block in blocks["blocks"]:
106
- if block["type"] == 0: # Text block
107
- for line in block["lines"]:
108
- for span in line["spans"]:
109
- text = span["text"].strip()
110
- if not text:
111
- continue
112
-
113
- font_size = span["size"]
114
-
115
- # Simple classification
116
- if font_size > 14:
117
- element_type = "heading"
118
- level = 1 if font_size > 18 else 2
119
- elif re.match(r'^[\d\-\•\*]+\.?\s+', text):
120
- element_type = "list"
121
- level = None
122
- else:
123
- element_type = "paragraph"
124
- level = None
125
-
126
- elements.append(PDFElement(
127
- type=element_type,
128
- content=text,
129
- page=page_num,
130
- level=level
131
- ))
132
-
133
- doc.close()
134
-
135
- # Also get tables with pdfplumber
136
- with pdfplumber.open(pdf_path) as pdf:
137
- for page_num, page in enumerate(pdf.pages, 1):
138
- tables = page.extract_tables()
139
- for table in tables:
140
- if table:
141
- elements.append(PDFElement(
142
- type="table",
143
- content=table,
144
- page=page_num
145
- ))
146
-
147
- return elements
148
-
149
- except Exception as e:
150
- print(f"PyMuPDF failed: {e}, falling back to pdfplumber")
151
-
152
- # Fallback to pdfplumber only
153
- with pdfplumber.open(pdf_path) as pdf:
154
- for page_num, page in enumerate(pdf.pages, 1):
155
- text = page.extract_text() or ""
156
- lines = text.split('\n')
157
-
158
- for line in lines:
159
- line = line.strip()
160
- if not line:
161
- continue
162
-
163
- if line.isupper() and len(line) < 100:
164
- element_type = "heading"
165
- level = 1
166
- elif re.match(r'^[\d\-\•\*]+\.?\s+', line):
167
- element_type = "list"
168
- level = None
169
- else:
170
- element_type = "paragraph"
171
- level = None
172
-
173
- elements.append(PDFElement(
174
- type=element_type,
175
- content=line,
176
- page=page_num,
177
- level=level
178
- ))
179
-
180
- # Extract tables
181
- tables = page.extract_tables()
182
  for table in tables:
183
- if table:
184
- elements.append(PDFElement(
185
- type="table",
186
- content=table,
187
- page=page_num
188
- ))
189
-
190
- return elements
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
 
192
- def _convert_to_html(self) -> str:
193
- """Convert elements to HTML with IDs"""
194
- html_parts = ['''
195
- <!DOCTYPE html>
196
- <html>
197
- <head>
198
- <meta charset="UTF-8">
199
- <style>
200
- body {
201
- font-family: Arial, sans-serif;
202
- line-height: 1.6;
203
- padding: 20px;
204
- max-width: 900px;
205
- margin: 0 auto;
206
- }
207
- h1, h2, h3 { color: #333; margin-top: 20px; }
208
- table {
209
- border-collapse: collapse;
210
- width: 100%;
211
- margin: 20px 0;
212
- }
213
- th, td {
214
- border: 1px solid #ddd;
215
- padding: 8px;
216
- text-align: left;
217
- }
218
- th { background-color: #f2f2f2; }
219
- p { margin: 10px 0; }
220
- li { margin: 5px 0; }
221
- .page-marker {
222
- color: #888;
223
- font-size: 0.9em;
224
- margin-top: 30px;
225
- padding-top: 10px;
226
- border-top: 2px solid #eee;
227
- }
228
- </style>
229
- </head>
230
- <body>
231
- ''']
232
-
233
- current_page = 0
234
-
235
- for elem in self.elements:
236
- # Add page marker
237
- if elem.page != current_page:
238
- current_page = elem.page
239
- html_parts.append(f'<div class="page-marker" id="page-{current_page}">Page {current_page}</div>')
240
-
241
- if elem.type == "heading":
242
- level = elem.level or 2
243
- elem_id = self._get_element_id('heading')
244
- content = html.escape(elem.content)
245
- html_parts.append(f'<h{level} id="{elem_id}" data-page="{elem.page}">{content}</h{level}>')
246
-
247
- elif elem.type == "paragraph":
248
- elem_id = self._get_element_id('paragraph')
249
- content = html.escape(elem.content)
250
- html_parts.append(f'<p id="{elem_id}" data-page="{elem.page}">{content}</p>')
251
-
252
- elif elem.type == "list":
253
- elem_id = self._get_element_id('list-item')
254
- content = html.escape(elem.content)
255
- html_parts.append(f'<li id="{elem_id}" data-page="{elem.page}">{content}</li>')
256
-
257
- elif elem.type == "table":
258
- elem_id = self._get_element_id('table')
259
- html_parts.append(f'<table id="{elem_id}" data-page="{elem.page}">')
260
- for i, row in enumerate(elem.content):
261
- row_id = self._get_element_id('table-row')
262
- html_parts.append(f'<tr id="{row_id}">')
263
- tag = 'th' if i == 0 else 'td'
264
- for j, cell in enumerate(row):
265
- cell_id = self._get_element_id('table-cell')
266
- cell_content = html.escape(str(cell)) if cell else ""
267
- html_parts.append(f'<{tag} id="{cell_id}">{cell_content}</{tag}>')
268
- html_parts.append('</tr>')
269
- html_parts.append('</table>')
270
-
271
- html_parts.append('</body></html>')
272
- return '\n'.join(html_parts)
273
 
274
 
275
- # Streamlit App
276
- def main():
277
- st.set_page_config(
278
- page_title="PDF to HTML Converter",
279
- page_icon="📄",
280
- layout="wide"
281
- )
282
-
283
- st.title("📄 PDF to HTML Converter")
284
- st.markdown("Extract PDF content and view as structured HTML")
285
-
286
- # Initialize session state
287
- if 'processor' not in st.session_state:
288
- st.session_state.processor = None
289
- if 'html_content' not in st.session_state:
290
- st.session_state.html_content = None
291
-
292
- # Input section
293
- pdf_url = st.text_input(
294
- "Enter PDF URL",
295
- placeholder="https://example.com/document.pdf",
296
- help="Enter the URL of the PDF you want to process"
297
- )
298
-
299
- if st.button("Process PDF", type="primary"):
300
- if not pdf_url:
301
- st.error("Please enter a PDF URL")
302
- else:
303
- with st.spinner("Processing PDF..."):
304
- try:
305
- processor = PDFProcessor()
306
- summary = processor.process_pdf(pdf_url)
307
-
308
- st.session_state.processor = processor
309
- st.session_state.html_content = processor.html_content
310
-
311
- st.success(f"✅ PDF processed successfully! ({summary['total_elements']} elements extracted)")
312
-
313
- except Exception as e:
314
- st.error(f"❌ Error processing PDF: {str(e)}")
315
-
316
- # Display HTML in iframe
317
- if st.session_state.html_content:
318
- st.markdown("---")
319
- st.subheader("📋 Extracted HTML Content")
320
-
321
- # Create tabs for different views
322
- tab1, tab2 = st.tabs(["HTML Preview", "HTML Source"])
323
-
324
- with tab1:
325
- # Display in iframe
326
- st.components.v1.html(
327
- st.session_state.html_content,
328
- height=800,
329
- scrolling=True
330
- )
331
-
332
- with tab2:
333
- # Show source code
334
- st.code(st.session_state.html_content, language='html')
335
-
336
- # Download button
337
- st.download_button(
338
- label="📥 Download HTML",
339
- data=st.session_state.html_content,
340
- file_name="extracted_content.html",
341
- mime="text/html"
342
- )
343
 
 
 
 
 
 
 
344
 
345
- if __name__ == "__main__":
346
- main()
 
 
1
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
+ import io
 
4
 
5
+ import requests
6
+ import pdfplumber
7
 
 
 
 
 
 
 
 
 
 
8
 
9
+ def fextractURL(pdf_path):
10
+ extracted_data = ""
11
 
12
+ if pdf_path.endswith('.pdf'):
13
+ # If the URL ends with .pdf, use pdfplumber directly
14
+ r = requests.get(pdf_path)
15
+ f = io.BytesIO(r.content)
16
+ with pdfplumber.open(f) as pdf:
17
+ for page in pdf.pages:
18
+ extracted_data += page.extract_text() + "\n" # Extract text
19
+ tables = page.extract_tables() # Extract tables
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  for table in tables:
21
+ for row in table:
22
+ extracted_data += "\t".join(str(cell) for cell in row) + "\n"
23
+ else:
24
+ # If the URL does not end with .pdf, download the PDF first
25
+ response = requests.get(pdf_path)
26
+ pdf_content = response.content
27
+
28
+ # Save the PDF locally
29
+ pdf_filename = 'downloaded_document.pdf'
30
+ with open(pdf_filename, 'wb') as pdf_file:
31
+ pdf_file.write(pdf_content)
32
+
33
+ # Extract content using pdfplumber
34
+ with pdfplumber.open(pdf_filename) as pdf:
35
+ for page in pdf.pages:
36
+ extracted_data += page.extract_text() + "\n" # Extract text
37
+ tables = page.extract_tables() # Extract tables
38
+ for table in tables:
39
+ for row in table:
40
+ extracted_data += "\t".join(str(cell) for cell in row) + "\n"
41
+
42
+ # Delete the PDF file
43
+
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
+ return extracted_data
49
+
50
+
51
# --- Streamlit UI ---
# Vertical spacer above the controls.
vert_space = '<div style="padding: 3rem 1rem;"></div>'
st.markdown(vert_space, unsafe_allow_html=True)
st.write("Extract full text from PDF url")  # fixed typo "Extarct"

# Same widgets as before; the long tails of default-valued keyword
# arguments were dropped (they all matched Streamlit's defaults).
pdfURL = st.text_input(label="origin URL", value="")
button = st.button(label='Extract')
extractedText = st.empty()

# Wire the button to the extractor: previously `button` was created but
# never checked, so clicking "Extract" did nothing at all.
if button:
    if pdfURL:
        extractedText.write(fextractURL(pdfURL))
    else:
        extractedText.write("Please enter a PDF URL")