Spaces:

rahul7star
/

OCR

Sleeping

App Files Files Community

rahul7star committed on Aug 29, 2025

Commit

35f0997

verified ·

1 Parent(s): 240048f

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -18

app.py CHANGED Viewed

@@ -23,40 +23,42 @@ from bs4 import BeautifulSoup
 def parse_html_to_json(html_file):
     """
-    Parse HTML content from a Gradio file input or string and produce
-    words/paragraphs JSON compatible with image OCR output.
     """
-    # Handle Gradio NamedString, str, or file-like object
     html_content = ""
-    if hasattr(html_file, "read"):  # real file
-        html_content = html_file.read()
-        if isinstance(html_content, bytes):
-            html_content = html_content.decode("utf-8")
-    elif isinstance(html_file, str):
-        html_content = html_file
-    else:  # Gradio NamedString
-        html_content = getattr(html_file, "name", str(html_file))
-    soup = BeautifulSoup(html_content, "html.parser")
     words_json = []
     paragraphs_json = []
     y_offset = 0
     line_height = 20
     char_width = 10
-    # iterate over all visible text nodes in the body
     body = soup.body
     if not body:
-        body = soup  # fallback
-    # Only consider visible text
     for element in body.find_all(text=True):
         text = element.strip()
         if not text:
             continue
-        # split into words
         line_words = text.split()
         line_bbox = [0, y_offset, char_width * len(text), y_offset + line_height]
@@ -74,14 +76,12 @@ def parse_html_to_json(html_file):
             "bbox": line_bbox,
             "words": word_entries
         })
         y_offset += line_height
     output_json = {
         "words": words_json,
         "paragraphs": paragraphs_json
     }
     return output_json
 # ----------------- Image Loading -----------------

 def parse_html_to_json(html_file):
     """
+    Properly parse HTML file uploaded via Gradio.
+    Returns JSON with words and paragraphs like image OCR output.
     """
     html_content = ""
+    try:
+        # Gradio gives a temp file path string for uploaded files
+        if isinstance(html_file, str):
+            with open(html_file, "r", encoding="utf-8") as f:
+                html_content = f.read()
+        elif hasattr(html_file, "read"):  # file-like object
+            html_content = html_file.read()
+            if isinstance(html_content, bytes):
+                html_content = html_content.decode("utf-8")
+        else:
+            html_content = str(html_file)
+    except Exception as e:
+        return {"error": f"Cannot read HTML file: {e}"}
+    soup = BeautifulSoup(html_content, "html.parser")
     words_json = []
     paragraphs_json = []
     y_offset = 0
     line_height = 20
     char_width = 10
     body = soup.body
     if not body:
+        body = soup
+    # iterate over all visible text nodes
     for element in body.find_all(text=True):
         text = element.strip()
         if not text:
             continue
         line_words = text.split()
         line_bbox = [0, y_offset, char_width * len(text), y_offset + line_height]
             "bbox": line_bbox,
             "words": word_entries
         })
         y_offset += line_height
     output_json = {
         "words": words_json,
         "paragraphs": paragraphs_json
     }
     return output_json
 # ----------------- Image Loading -----------------