Spaces:

rahul7star
/

OCR

Paused

App Files Files Community

rahul7star committed on Aug 29, 2025

Commit

eeb0aa9

verified ·

1 Parent(s): 065f9a9

Update app.py

Browse files

Files changed (1) hide show

app.py +55 -45

app.py CHANGED Viewed

@@ -16,10 +16,10 @@ processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
 model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
 reader = easyocr.Reader(['en'])
-def extract_images_from_html(html_file):
-    """Extract images from HTML file (base64 or URLs)"""
     images = []
-    soup = BeautifulSoup(html_file.read(), "html.parser")
     for img_tag in soup.find_all("img"):
         src = img_tag.get("src")
         if not src:
@@ -39,7 +39,18 @@ def extract_images_from_html(html_file):
 def parse_html_text(html_file):
     """Parse HTML text and generate approximate bounding boxes"""
-    html_content = html_file.read().decode("utf-8")
     soup = BeautifulSoup(html_content, "html.parser")
     body_text = soup.get_text(separator="\n")
     lines = [line.strip() for line in body_text.split("\n") if line.strip()]
@@ -79,10 +90,11 @@ def parse_html_text(html_file):
     output_json = {
         "words": words_json,
-        "lines": lines_json
     }
-    return html_content, output_json
 def load_image(image_file, image_url):
     if image_file:
@@ -95,61 +107,59 @@ def load_image(image_file, image_url):
 def detect_text_combined(image_file, image_url, html_file):
     # HTML path
     if html_file:
-        html_content, output_json = parse_html_text(html_file)
         json_str = json.dumps(output_json, indent=2)
         tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w")
         tmp_file.write(json_str)
         tmp_file.close()
-        return html_content, json_str, tmp_file.name
     # Image path
     images = load_image(image_file, image_url)
     if not images:
         return None, "No input provided.", None
-    all_output_json = []
-    annotated_images = []
-    for image in images:
-        results = reader.readtext(np.array(image))
-        draw = ImageDraw.Draw(image)
-        words_json = []
-        for bbox, _, conf in results:
-            x_coords = [float(point[0]) for point in bbox]
-            y_coords = [float(point[1]) for point in bbox]
-            x_min, y_min = min(x_coords), min(y_coords)
-            x_max, y_max = max(x_coords), max(y_coords)
-            # Crop word for TrOCR recognition
-            word_crop = image.crop((x_min, y_min, x_max, y_max))
-            pixel_values = processor(images=word_crop, return_tensors="pt").pixel_values
-            generated_ids = model.generate(pixel_values)
-            text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-            draw.rectangle([x_min, y_min, x_max, y_max], outline="red", width=2)
-            words_json.append({
-                "text": text,
-                "bbox": [x_min, y_min, x_max, y_max],
-                "confidence": float(conf)
-            })
-        paragraphs_json = words_json.copy()
-        output_json = {
-            "words": words_json,
-            "paragraphs": paragraphs_json
-        }
-        json_str = json.dumps(output_json, indent=2)
-        tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w")
-        tmp_file.write(json_str)
-        tmp_file.close()
-        annotated_images.append((image, json_str, tmp_file.name))
-    # Return first image for simplicity (can extend to gallery)
-    return annotated_images[0]
 iface = gr.Interface(
     fn=detect_text_combined,

 model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
 reader = easyocr.Reader(['en'])
+def extract_images_from_html(html_content):
+    """Extract images from HTML content (base64 or URLs)"""
     images = []
+    soup = BeautifulSoup(html_content, "html.parser")
     for img_tag in soup.find_all("img"):
         src = img_tag.get("src")
         if not src:
 def parse_html_text(html_file):
     """Parse HTML text and generate approximate bounding boxes"""
+    # Handle different Gradio file types
+    if hasattr(html_file, "read"):
+        html_content = html_file.read()
+        if isinstance(html_content, bytes):
+            html_content = html_content.decode("utf-8")
+    else:
+        # NamedString object (Gradio v3.40+)
+        html_content = str(html_file)
+    # Extract images from HTML (optional, for OCR later)
+    images_in_html = extract_images_from_html(html_content)
     soup = BeautifulSoup(html_content, "html.parser")
     body_text = soup.get_text(separator="\n")
     lines = [line.strip() for line in body_text.split("\n") if line.strip()]
     output_json = {
         "words": words_json,
+        "lines": lines_json,
+        "images_found": len(images_in_html)
     }
+    return html_content, output_json, images_in_html
 def load_image(image_file, image_url):
     if image_file:
 def detect_text_combined(image_file, image_url, html_file):
     # HTML path
     if html_file:
+        html_content, output_json, images_in_html = parse_html_text(html_file)
         json_str = json.dumps(output_json, indent=2)
         tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w")
         tmp_file.write(json_str)
         tmp_file.close()
+        annotated_image = None
+        if images_in_html:
+            # For demo, show first extracted image if exists
+            annotated_image = images_in_html[0]
+        return annotated_image, json_str, tmp_file.name
     # Image path
     images = load_image(image_file, image_url)
     if not images:
         return None, "No input provided.", None
+    annotated_image = images[0]
+    image = annotated_image
+    results = reader.readtext(np.array(image))
+    draw = ImageDraw.Draw(image)
+    words_json = []
+    for bbox, _, conf in results:
+        x_coords = [float(point[0]) for point in bbox]
+        y_coords = [float(point[1]) for point in bbox]
+        x_min, y_min = min(x_coords), min(y_coords)
+        x_max, y_max = max(x_coords), max(y_coords)
+        # Crop word for TrOCR recognition
+        word_crop = image.crop((x_min, y_min, x_max, y_max))
+        pixel_values = processor(images=word_crop, return_tensors="pt").pixel_values
+        generated_ids = model.generate(pixel_values)
+        text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        draw.rectangle([x_min, y_min, x_max, y_max], outline="red", width=2)
+        words_json.append({
+            "text": text,
+            "bbox": [x_min, y_min, x_max, y_max],
+            "confidence": float(conf)
+        })
+    paragraphs_json = words_json.copy()
+    output_json = {
+        "words": words_json,
+        "paragraphs": paragraphs_json
+    }
+    json_str = json.dumps(output_json, indent=2)
+    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w")
+    tmp_file.write(json_str)
+    tmp_file.close()
+    return annotated_image, json_str, tmp_file.name
 iface = gr.Interface(
     fn=detect_text_combined,