# Hugging Face Spaces page header ("Spaces: Sleeping") — scraping artifact, not part of the app code.
| import gradio as gr | |
| from PIL import Image, ImageDraw | |
| import requests | |
| from io import BytesIO | |
| import numpy as np | |
| import json | |
| import tempfile | |
| import easyocr | |
| from transformers import TrOCRProcessor, VisionEncoderDecoderModel | |
| from bs4 import BeautifulSoup | |
| import base64 | |
| import re | |
# ----------------- Initialize OCR -----------------
# Models are loaded once at import time; the first run downloads weights
# from the Hugging Face hub, so startup can be slow.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
reader = easyocr.Reader(['en'])  # EasyOCR detector (English) used for word bounding boxes
| # ----------------- HTML Parsing ----------------- | |
| from bs4 import BeautifulSoup | |
| from bs4 import BeautifulSoup | |
def parse_html_to_json(html_file):
    """Parse an HTML file uploaded via Gradio into OCR-style JSON.

    Returns a dict with "words" and "paragraphs" lists matching the image
    OCR output format. Bounding boxes are synthetic: each text node becomes
    one line, laid out top-to-bottom with a fixed line height and a fixed
    per-character width.

    html_file may be a temp-file path string (Gradio's usual form), a
    file-like object, or raw content. On read failure a dict with an
    "error" key is returned instead.
    """
    html_content = ""
    try:
        if isinstance(html_file, str):
            # Gradio gives a temp file path string for uploaded files.
            with open(html_file, "r", encoding="utf-8") as f:
                html_content = f.read()
        elif hasattr(html_file, "read"):  # file-like object
            html_content = html_file.read()
            if isinstance(html_content, bytes):
                html_content = html_content.decode("utf-8")
        else:
            html_content = str(html_file)
    except Exception as e:
        return {"error": f"Cannot read HTML file: {e}"}

    soup = BeautifulSoup(html_content, "html.parser")
    words_json = []
    paragraphs_json = []
    y_offset = 0
    line_height = 20  # synthetic bbox geometry: fixed line height...
    char_width = 10   # ...and fixed per-character width

    body = soup.body
    if not body:
        body = soup

    # Non-visible containers whose text nodes must NOT appear in the output.
    _invisible = {"script", "style", "head", "title", "meta", "noscript"}

    # Iterate over visible text nodes only. `string=True` is the modern
    # spelling of the deprecated `text=True` alias; the parent check fixes
    # the bug where <script>/<style> contents leaked into the results.
    for element in body.find_all(string=True):
        parent = element.parent
        if parent is not None and parent.name in _invisible:
            continue
        text = element.strip()
        if not text:
            continue

        line_words = text.split()
        line_bbox = [0, y_offset, char_width * len(text), y_offset + line_height]
        word_entries = []
        x_offset = 0
        for word in line_words:
            word_bbox = [x_offset, y_offset, x_offset + char_width * len(word), y_offset + line_height]
            word_entry = {"text": word, "bbox": word_bbox, "confidence": 1.0}
            word_entries.append(word_entry)
            words_json.append(word_entry)
            x_offset += char_width * (len(word) + 1)  # +1 for the inter-word space

        paragraphs_json.append({
            "text": text,
            "bbox": line_bbox,
            "words": word_entries
        })
        y_offset += line_height

    return {
        "words": words_json,
        "paragraphs": paragraphs_json
    }
| # ----------------- Image Loading ----------------- | |
def load_image(image_file, image_url):
    """Return a single-element list containing the input image, or [].

    image_file: a PIL image supplied by Gradio (takes precedence when given).
    image_url:  optional URL; the image is downloaded and converted to RGB.

    Raises requests.HTTPError for a non-2xx response instead of handing an
    HTML error page to PIL, and uses a timeout so a dead host cannot hang
    the app indefinitely.
    """
    if image_file is not None:
        return [image_file]
    if image_url:
        response = requests.get(image_url, timeout=30)
        response.raise_for_status()  # fail fast on 404/500 etc.
        return [Image.open(BytesIO(response.content)).convert("RGB")]
    return []
| # ----------------- Main Logic ----------------- | |
def _write_json_tmp(json_str):
    """Write json_str to a persistent temp .json file and return its path.

    UTF-8 is forced so the JSON download is not written in a
    platform-dependent locale encoding.
    """
    tmp_file = tempfile.NamedTemporaryFile(
        delete=False, suffix=".json", mode="w", encoding="utf-8"
    )
    tmp_file.write(json_str)
    tmp_file.close()
    return tmp_file.name


def detect_text_combined(image_file, image_url, html_file):
    """Dispatch between the HTML path and the image OCR path.

    Returns (annotated_image_or_None, json_string, json_file_path_or_None).
    """
    # ----------------- HTML Path -----------------
    if html_file:
        output_json = parse_html_to_json(html_file)
        json_str = json.dumps(output_json, indent=2)
        return None, json_str, _write_json_tmp(json_str)

    # ----------------- Image Path -----------------
    images = load_image(image_file, image_url)
    if not images:
        return None, "No input provided.", None

    # The annotation rectangles are drawn directly onto the input image.
    image = images[0]
    annotated_image = image

    # EasyOCR detects word-level boxes; TrOCR re-recognizes each crop
    # (EasyOCR's own recognized text is intentionally discarded, but its
    # detection confidence is kept).
    results = reader.readtext(np.array(image))
    draw = ImageDraw.Draw(image)

    words_json = []
    for bbox, _, conf in results:
        x_coords = [float(point[0]) for point in bbox]
        y_coords = [float(point[1]) for point in bbox]
        x_min, y_min = min(x_coords), min(y_coords)
        x_max, y_max = max(x_coords), max(y_coords)

        # Crop word for TrOCR recognition.
        word_crop = image.crop((x_min, y_min, x_max, y_max))
        pixel_values = processor(images=word_crop, return_tensors="pt").pixel_values
        generated_ids = model.generate(pixel_values)
        text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        draw.rectangle([x_min, y_min, x_max, y_max], outline="red", width=2)
        words_json.append({
            "text": text,
            "bbox": [x_min, y_min, x_max, y_max],
            "confidence": float(conf)
        })

    # NOTE(review): no paragraph grouping is performed for images —
    # "paragraphs" is a shallow copy of the word entries.
    output_json = {
        "words": words_json,
        "paragraphs": words_json.copy()
    }
    json_str = json.dumps(output_json, indent=2)
    return annotated_image, json_str, _write_json_tmp(json_str)
| # ----------------- Gradio Interface ----------------- | |
# ----------------- Gradio Interface -----------------
# Components are named up front, then wired into the Interface.
_input_components = [
    gr.Image(type="pil", label="Upload Image"),
    gr.Textbox(label="Image URL (optional)"),
    gr.File(label="Upload HTML File", file_types=[".html", ".htm"]),
]
_output_components = [
    gr.Image(type="pil", label="Annotated Image"),
    gr.Textbox(label="JSON Output"),
    gr.File(label="Download JSON"),
]

iface = gr.Interface(
    fn=detect_text_combined,
    inputs=_input_components,
    outputs=_output_components,
    title="Combined OCR & HTML Text Bounding Box Extractor",
    description="Upload an image, provide an image URL, or upload an HTML file. Outputs word- and paragraph-level bounding boxes in JSON format consistent with image OCR output."
)

if __name__ == "__main__":
    iface.launch()