Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -23,40 +23,42 @@ from bs4 import BeautifulSoup
|
|
| 23 |
|
| 24 |
def parse_html_to_json(html_file):
|
| 25 |
"""
|
| 26 |
-
|
| 27 |
-
|
| 28 |
"""
|
| 29 |
-
# Handle Gradio NamedString, str, or file-like object
|
| 30 |
html_content = ""
|
| 31 |
-
if hasattr(html_file, "read"): # real file
|
| 32 |
-
html_content = html_file.read()
|
| 33 |
-
if isinstance(html_content, bytes):
|
| 34 |
-
html_content = html_content.decode("utf-8")
|
| 35 |
-
elif isinstance(html_file, str):
|
| 36 |
-
html_content = html_file
|
| 37 |
-
else: # Gradio NamedString
|
| 38 |
-
html_content = getattr(html_file, "name", str(html_file))
|
| 39 |
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
|
|
|
| 42 |
words_json = []
|
| 43 |
paragraphs_json = []
|
| 44 |
y_offset = 0
|
| 45 |
line_height = 20
|
| 46 |
char_width = 10
|
| 47 |
|
| 48 |
-
# iterate over all visible text nodes in the body
|
| 49 |
body = soup.body
|
| 50 |
if not body:
|
| 51 |
-
body = soup
|
| 52 |
|
| 53 |
-
#
|
| 54 |
for element in body.find_all(text=True):
|
| 55 |
text = element.strip()
|
| 56 |
if not text:
|
| 57 |
continue
|
| 58 |
|
| 59 |
-
# split into words
|
| 60 |
line_words = text.split()
|
| 61 |
line_bbox = [0, y_offset, char_width * len(text), y_offset + line_height]
|
| 62 |
|
|
@@ -74,14 +76,12 @@ def parse_html_to_json(html_file):
|
|
| 74 |
"bbox": line_bbox,
|
| 75 |
"words": word_entries
|
| 76 |
})
|
| 77 |
-
|
| 78 |
y_offset += line_height
|
| 79 |
|
| 80 |
output_json = {
|
| 81 |
"words": words_json,
|
| 82 |
"paragraphs": paragraphs_json
|
| 83 |
}
|
| 84 |
-
|
| 85 |
return output_json
|
| 86 |
|
| 87 |
# ----------------- Image Loading -----------------
|
|
|
|
| 23 |
|
| 24 |
def parse_html_to_json(html_file):
|
| 25 |
"""
|
| 26 |
+
Properly parse HTML file uploaded via Gradio.
|
| 27 |
+
Returns JSON with words and paragraphs like image OCR output.
|
| 28 |
"""
|
|
|
|
| 29 |
html_content = ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
+
try:
|
| 32 |
+
# Gradio gives a temp file path string for uploaded files
|
| 33 |
+
if isinstance(html_file, str):
|
| 34 |
+
with open(html_file, "r", encoding="utf-8") as f:
|
| 35 |
+
html_content = f.read()
|
| 36 |
+
elif hasattr(html_file, "read"): # file-like object
|
| 37 |
+
html_content = html_file.read()
|
| 38 |
+
if isinstance(html_content, bytes):
|
| 39 |
+
html_content = html_content.decode("utf-8")
|
| 40 |
+
else:
|
| 41 |
+
html_content = str(html_file)
|
| 42 |
+
except Exception as e:
|
| 43 |
+
return {"error": f"Cannot read HTML file: {e}"}
|
| 44 |
|
| 45 |
+
soup = BeautifulSoup(html_content, "html.parser")
|
| 46 |
words_json = []
|
| 47 |
paragraphs_json = []
|
| 48 |
y_offset = 0
|
| 49 |
line_height = 20
|
| 50 |
char_width = 10
|
| 51 |
|
|
|
|
| 52 |
body = soup.body
|
| 53 |
if not body:
|
| 54 |
+
body = soup
|
| 55 |
|
| 56 |
+
# iterate over all visible text nodes
|
| 57 |
for element in body.find_all(text=True):
|
| 58 |
text = element.strip()
|
| 59 |
if not text:
|
| 60 |
continue
|
| 61 |
|
|
|
|
| 62 |
line_words = text.split()
|
| 63 |
line_bbox = [0, y_offset, char_width * len(text), y_offset + line_height]
|
| 64 |
|
|
|
|
| 76 |
"bbox": line_bbox,
|
| 77 |
"words": word_entries
|
| 78 |
})
|
|
|
|
| 79 |
y_offset += line_height
|
| 80 |
|
| 81 |
output_json = {
|
| 82 |
"words": words_json,
|
| 83 |
"paragraphs": paragraphs_json
|
| 84 |
}
|
|
|
|
| 85 |
return output_json
|
| 86 |
|
| 87 |
# ----------------- Image Loading -----------------
|