Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -17,8 +17,11 @@ model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwrit
|
|
| 17 |
reader = easyocr.Reader(['en'])
|
| 18 |
|
| 19 |
# ----------------- HTML Parsing -----------------
|
|
|
|
|
|
|
| 20 |
def parse_html_to_json(html_file):
|
| 21 |
-
"""
|
|
|
|
| 22 |
if hasattr(html_file, "read"):
|
| 23 |
html_content = html_file.read()
|
| 24 |
if isinstance(html_content, bytes):
|
|
@@ -34,11 +37,17 @@ def parse_html_to_json(html_file):
|
|
| 34 |
line_height = 20
|
| 35 |
char_width = 10
|
| 36 |
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
if not text:
|
| 40 |
continue
|
| 41 |
|
|
|
|
| 42 |
line_words = text.split()
|
| 43 |
line_bbox = [0, y_offset, char_width * len(text), y_offset + line_height]
|
| 44 |
|
|
@@ -46,16 +55,9 @@ def parse_html_to_json(html_file):
|
|
| 46 |
x_offset = 0
|
| 47 |
for word in line_words:
|
| 48 |
word_bbox = [x_offset, y_offset, x_offset + char_width * len(word), y_offset + line_height]
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
"confidence": 1.0
|
| 53 |
-
})
|
| 54 |
-
words_json.append({
|
| 55 |
-
"text": word,
|
| 56 |
-
"bbox": word_bbox,
|
| 57 |
-
"confidence": 1.0
|
| 58 |
-
})
|
| 59 |
x_offset += char_width * (len(word) + 1)
|
| 60 |
|
| 61 |
paragraphs_json.append({
|
|
|
|
| 17 |
reader = easyocr.Reader(['en'])
|
| 18 |
|
| 19 |
# ----------------- HTML Parsing -----------------
|
| 20 |
+
from bs4 import BeautifulSoup
|
| 21 |
+
|
| 22 |
def parse_html_to_json(html_file):
|
| 23 |
+
"""Parse HTML and extract words/paragraphs JSON compatible with image OCR output."""
|
| 24 |
+
# Read content depending on object type
|
| 25 |
if hasattr(html_file, "read"):
|
| 26 |
html_content = html_file.read()
|
| 27 |
if isinstance(html_content, bytes):
|
|
|
|
| 37 |
line_height = 20
|
| 38 |
char_width = 10
|
| 39 |
|
| 40 |
+
# Iterate over all text elements inside the body
|
| 41 |
+
body = soup.body
|
| 42 |
+
if not body:
|
| 43 |
+
body = soup # fallback if <body> missing
|
| 44 |
+
|
| 45 |
+
for element in body.find_all(text=True):
|
| 46 |
+
text = element.strip()
|
| 47 |
if not text:
|
| 48 |
continue
|
| 49 |
|
| 50 |
+
# Split into words
|
| 51 |
line_words = text.split()
|
| 52 |
line_bbox = [0, y_offset, char_width * len(text), y_offset + line_height]
|
| 53 |
|
|
|
|
| 55 |
x_offset = 0
|
| 56 |
for word in line_words:
|
| 57 |
word_bbox = [x_offset, y_offset, x_offset + char_width * len(word), y_offset + line_height]
|
| 58 |
+
word_entry = {"text": word, "bbox": word_bbox, "confidence": 1.0}
|
| 59 |
+
word_entries.append(word_entry)
|
| 60 |
+
words_json.append(word_entry)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
x_offset += char_width * (len(word) + 1)
|
| 62 |
|
| 63 |
paragraphs_json.append({
|