Update app.py
Browse files
app.py
CHANGED
|
@@ -590,6 +590,230 @@
|
|
| 590 |
|
| 591 |
|
| 592 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 593 |
import gradio as gr
|
| 594 |
import torch
|
| 595 |
import numpy as np
|
|
@@ -597,19 +821,29 @@ import cv2
|
|
| 597 |
from PIL import Image
|
| 598 |
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
|
| 599 |
from paddleocr import PaddleOCR
|
|
|
|
| 600 |
|
| 601 |
-
#
|
| 602 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 603 |
print(f"Loading TrOCR on {device}...")
|
| 604 |
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
|
| 605 |
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten').to(device).eval()
|
| 606 |
|
|
|
|
| 607 |
print("Loading PaddleOCR...")
|
|
|
|
| 608 |
detector = PaddleOCR(use_angle_cls=True, lang='en', show_log=False,
|
| 609 |
det_limit_side_len=2500, det_db_thresh=0.1, det_db_box_thresh=0.3)
|
| 610 |
|
| 611 |
-
|
| 612 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 613 |
x1 = max(box1[0], box2[0])
|
| 614 |
y1 = max(box1[1], box2[1])
|
| 615 |
x2 = min(box1[2], box2[2])
|
|
@@ -620,150 +854,156 @@ def calculate_iou(box1, box2):
|
|
| 620 |
|
| 621 |
intersection = (x2 - x1) * (y2 - y1)
|
| 622 |
area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
|
| 623 |
-
area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
|
| 624 |
|
| 625 |
-
return intersection /
|
| 626 |
|
| 627 |
-
def
|
| 628 |
-
"""
|
| 629 |
-
|
| 630 |
-
|
|
|
|
| 631 |
|
| 632 |
-
#
|
| 633 |
-
|
| 634 |
for b in boxes:
|
| 635 |
area = (b[2] - b[0]) * (b[3] - b[1])
|
| 636 |
-
|
| 637 |
|
| 638 |
-
# Sort by
|
| 639 |
-
|
| 640 |
|
| 641 |
-
|
| 642 |
-
|
| 643 |
-
|
|
|
|
| 644 |
curr_box = current[:4]
|
| 645 |
|
| 646 |
-
|
| 647 |
-
|
| 648 |
-
|
| 649 |
-
|
|
|
|
|
|
|
| 650 |
break
|
| 651 |
|
| 652 |
-
if
|
| 653 |
-
|
| 654 |
-
|
| 655 |
-
return
|
|
|
|
| 656 |
|
| 657 |
-
|
| 658 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 659 |
if raw_boxes is None or len(raw_boxes) == 0:
|
| 660 |
return []
|
| 661 |
-
|
| 662 |
-
# Convert
|
| 663 |
rects = []
|
| 664 |
for box in raw_boxes:
|
| 665 |
box = np.array(box).astype(np.float32)
|
| 666 |
x1, y1 = np.min(box[:, 0]), np.min(box[:, 1])
|
| 667 |
x2, y2 = np.max(box[:, 0]), np.max(box[:, 1])
|
| 668 |
rects.append([x1, y1, x2, y2])
|
| 669 |
-
|
| 670 |
-
|
| 671 |
-
|
| 672 |
-
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
| 676 |
-
# Sort by Y
|
| 677 |
-
rects.sort(key=lambda r: r[1])
|
| 678 |
-
|
| 679 |
-
# Group into lines based on Y overlap
|
| 680 |
lines = []
|
| 681 |
-
|
| 682 |
-
|
| 683 |
-
|
| 684 |
-
|
| 685 |
-
|
| 686 |
-
|
| 687 |
-
|
| 688 |
-
|
| 689 |
-
|
| 690 |
-
|
| 691 |
-
|
| 692 |
-
#
|
| 693 |
-
|
| 694 |
-
|
| 695 |
-
|
| 696 |
-
|
| 697 |
-
|
| 698 |
-
|
| 699 |
-
|
| 700 |
-
|
| 701 |
-
# Save current line and start new one
|
| 702 |
-
lines.append(current_line)
|
| 703 |
-
current_line = [rect]
|
| 704 |
-
|
| 705 |
-
lines.append(current_line)
|
| 706 |
-
|
| 707 |
-
# Merge boxes in each line
|
| 708 |
-
merged = []
|
| 709 |
-
for line in lines:
|
| 710 |
-
# Sort line boxes left to right
|
| 711 |
-
line.sort(key=lambda r: r[0])
|
| 712 |
-
|
| 713 |
-
# Merge horizontally close boxes
|
| 714 |
-
merged_line = [line[0]]
|
| 715 |
-
for rect in line[1:]:
|
| 716 |
-
last = merged_line[-1]
|
| 717 |
-
# If close horizontally, merge
|
| 718 |
-
if rect[0] - last[2] < x_gap_thresh:
|
| 719 |
-
merged_line[-1] = [
|
| 720 |
-
min(last[0], rect[0]),
|
| 721 |
-
min(last[1], rect[1]),
|
| 722 |
-
max(last[2], rect[2]),
|
| 723 |
-
max(last[3], rect[3])
|
| 724 |
-
]
|
| 725 |
else:
|
| 726 |
-
|
|
|
|
|
|
|
| 727 |
|
| 728 |
-
#
|
| 729 |
-
|
| 730 |
-
|
| 731 |
-
|
| 732 |
-
|
| 733 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 734 |
|
| 735 |
-
|
| 736 |
-
|
| 737 |
-
|
| 738 |
|
| 739 |
def process_image(image):
|
| 740 |
-
|
| 741 |
-
return None, [], "Please upload an image."
|
| 742 |
|
| 743 |
-
|
|
|
|
| 744 |
|
|
|
|
|
|
|
|
|
|
| 745 |
try:
|
| 746 |
dt_boxes, _ = detector.text_detector(image_np)
|
| 747 |
except Exception as e:
|
| 748 |
-
return image, [], f"Detection Error: {str(e)}"
|
| 749 |
-
|
| 750 |
if dt_boxes is None or len(dt_boxes) == 0:
|
| 751 |
-
return image, [], "No text detected."
|
| 752 |
|
| 753 |
-
|
|
|
|
| 754 |
|
| 755 |
annotated_img = image_np.copy()
|
| 756 |
results = []
|
| 757 |
debug_crops = []
|
| 758 |
|
| 759 |
-
|
|
|
|
|
|
|
|
|
|
| 760 |
x1, y1, x2, y2 = map(int, box)
|
| 761 |
|
|
|
|
|
|
|
|
|
|
| 762 |
if (x2 - x1) < 20 or (y2 - y1) < 15:
|
|
|
|
| 763 |
continue
|
| 764 |
-
|
|
|
|
| 765 |
cv2.rectangle(annotated_img, (x1, y1), (x2, y2), (0, 255, 0), 2)
|
| 766 |
|
|
|
|
| 767 |
PAD = 10
|
| 768 |
h, w, _ = image_np.shape
|
| 769 |
x1 = max(0, x1 - PAD)
|
|
@@ -775,18 +1015,20 @@ def process_image(image):
|
|
| 775 |
pil_crop = Image.fromarray(crop)
|
| 776 |
debug_crops.append(pil_crop)
|
| 777 |
|
|
|
|
| 778 |
with torch.no_grad():
|
| 779 |
pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
|
| 780 |
generated_ids = model.generate(pixel_values)
|
| 781 |
text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
| 782 |
if text.strip():
|
| 783 |
results.append(text)
|
| 784 |
-
|
| 785 |
full_text = "\n".join(results)
|
| 786 |
-
return Image.fromarray(annotated_img), debug_crops, full_text
|
| 787 |
|
|
|
|
| 788 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 789 |
-
gr.Markdown("# ⚡ Smart Line-Level OCR (
|
| 790 |
|
| 791 |
with gr.Row():
|
| 792 |
with gr.Column(scale=1):
|
|
@@ -794,13 +1036,18 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 794 |
btn = gr.Button("Transcribe", variant="primary")
|
| 795 |
|
| 796 |
with gr.Column(scale=1):
|
| 797 |
-
|
| 798 |
-
|
| 799 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 800 |
with gr.Row():
|
| 801 |
-
gallery = gr.Gallery(label="Line Crops", columns=4, height=200)
|
| 802 |
-
|
| 803 |
-
btn.click(process_image, input_img, [output_img, gallery, output_txt])
|
| 804 |
|
| 805 |
if __name__ == "__main__":
|
| 806 |
demo.launch()
|
|
|
|
| 590 |
|
| 591 |
|
| 592 |
|
| 593 |
+
# import gradio as gr
|
| 594 |
+
# import torch
|
| 595 |
+
# import numpy as np
|
| 596 |
+
# import cv2
|
| 597 |
+
# from PIL import Image
|
| 598 |
+
# from transformers import TrOCRProcessor, VisionEncoderDecoderModel
|
| 599 |
+
# from paddleocr import PaddleOCR
|
| 600 |
+
|
| 601 |
+
# # Setup
|
| 602 |
+
# device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 603 |
+
# print(f"Loading TrOCR on {device}...")
|
| 604 |
+
# processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
|
| 605 |
+
# model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten').to(device).eval()
|
| 606 |
+
|
| 607 |
+
# print("Loading PaddleOCR...")
|
| 608 |
+
# detector = PaddleOCR(use_angle_cls=True, lang='en', show_log=False,
|
| 609 |
+
# det_limit_side_len=2500, det_db_thresh=0.1, det_db_box_thresh=0.3)
|
| 610 |
+
|
| 611 |
+
# def calculate_iou(box1, box2):
|
| 612 |
+
# """Calculate Intersection over Union"""
|
| 613 |
+
# x1 = max(box1[0], box2[0])
|
| 614 |
+
# y1 = max(box1[1], box2[1])
|
| 615 |
+
# x2 = min(box1[2], box2[2])
|
| 616 |
+
# y2 = min(box1[3], box2[3])
|
| 617 |
+
|
| 618 |
+
# if x2 < x1 or y2 < y1:
|
| 619 |
+
# return 0.0
|
| 620 |
+
|
| 621 |
+
# intersection = (x2 - x1) * (y2 - y1)
|
| 622 |
+
# area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
|
| 623 |
+
# area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
|
| 624 |
+
|
| 625 |
+
# return intersection / min(area1, area2)
|
| 626 |
+
|
| 627 |
+
# def remove_nested_boxes(boxes, iou_thresh=0.7):
|
| 628 |
+
# """Remove boxes that are nested inside others"""
|
| 629 |
+
# if len(boxes) == 0:
|
| 630 |
+
# return []
|
| 631 |
+
|
| 632 |
+
# # Add area to each box
|
| 633 |
+
# boxes_with_area = []
|
| 634 |
+
# for b in boxes:
|
| 635 |
+
# area = (b[2] - b[0]) * (b[3] - b[1])
|
| 636 |
+
# boxes_with_area.append((*b, area))
|
| 637 |
+
|
| 638 |
+
# # Sort by area descending (keep larger boxes)
|
| 639 |
+
# boxes_with_area.sort(key=lambda x: x[4], reverse=True)
|
| 640 |
+
|
| 641 |
+
# keep = []
|
| 642 |
+
# for i, current in enumerate(boxes_with_area):
|
| 643 |
+
# should_keep = True
|
| 644 |
+
# curr_box = current[:4]
|
| 645 |
+
|
| 646 |
+
# for kept in keep:
|
| 647 |
+
# iou = calculate_iou(curr_box, kept)
|
| 648 |
+
# if iou > iou_thresh:
|
| 649 |
+
# should_keep = False
|
| 650 |
+
# break
|
| 651 |
+
|
| 652 |
+
# if should_keep:
|
| 653 |
+
# keep.append(curr_box)
|
| 654 |
+
|
| 655 |
+
# return keep
|
| 656 |
+
|
| 657 |
+
# def merge_boxes_into_lines(raw_boxes, y_overlap_thresh=0.5, x_gap_thresh=100):
|
| 658 |
+
# """Merge boxes into lines with better horizontal merging"""
|
| 659 |
+
# if raw_boxes is None or len(raw_boxes) == 0:
|
| 660 |
+
# return []
|
| 661 |
+
|
| 662 |
+
# # Convert polygons to rectangles
|
| 663 |
+
# rects = []
|
| 664 |
+
# for box in raw_boxes:
|
| 665 |
+
# box = np.array(box).astype(np.float32)
|
| 666 |
+
# x1, y1 = np.min(box[:, 0]), np.min(box[:, 1])
|
| 667 |
+
# x2, y2 = np.max(box[:, 0]), np.max(box[:, 1])
|
| 668 |
+
# rects.append([x1, y1, x2, y2])
|
| 669 |
+
|
| 670 |
+
# # Remove nested boxes
|
| 671 |
+
# rects = remove_nested_boxes(rects)
|
| 672 |
+
|
| 673 |
+
# if len(rects) == 0:
|
| 674 |
+
# return []
|
| 675 |
+
|
| 676 |
+
# # Sort by Y position
|
| 677 |
+
# rects.sort(key=lambda r: r[1])
|
| 678 |
+
|
| 679 |
+
# # Group into lines based on Y overlap
|
| 680 |
+
# lines = []
|
| 681 |
+
# current_line = [rects[0]]
|
| 682 |
+
|
| 683 |
+
# for rect in rects[1:]:
|
| 684 |
+
# # Check if rect belongs to current line
|
| 685 |
+
# line_y1 = min(r[1] for r in current_line)
|
| 686 |
+
# line_y2 = max(r[3] for r in current_line)
|
| 687 |
+
# line_height = line_y2 - line_y1
|
| 688 |
+
|
| 689 |
+
# rect_y1, rect_y2 = rect[1], rect[3]
|
| 690 |
+
# rect_height = rect_y2 - rect_y1
|
| 691 |
+
|
| 692 |
+
# # Calculate vertical overlap
|
| 693 |
+
# overlap_y1 = max(line_y1, rect_y1)
|
| 694 |
+
# overlap_y2 = min(line_y2, rect_y2)
|
| 695 |
+
# overlap = max(0, overlap_y2 - overlap_y1)
|
| 696 |
+
|
| 697 |
+
# # If significant vertical overlap, it's the same line
|
| 698 |
+
# if overlap > y_overlap_thresh * min(line_height, rect_height):
|
| 699 |
+
# current_line.append(rect)
|
| 700 |
+
# else:
|
| 701 |
+
# # Save current line and start new one
|
| 702 |
+
# lines.append(current_line)
|
| 703 |
+
# current_line = [rect]
|
| 704 |
+
|
| 705 |
+
# lines.append(current_line)
|
| 706 |
+
|
| 707 |
+
# # Merge boxes in each line
|
| 708 |
+
# merged = []
|
| 709 |
+
# for line in lines:
|
| 710 |
+
# # Sort line boxes left to right
|
| 711 |
+
# line.sort(key=lambda r: r[0])
|
| 712 |
+
|
| 713 |
+
# # Merge horizontally close boxes
|
| 714 |
+
# merged_line = [line[0]]
|
| 715 |
+
# for rect in line[1:]:
|
| 716 |
+
# last = merged_line[-1]
|
| 717 |
+
# # If close horizontally, merge
|
| 718 |
+
# if rect[0] - last[2] < x_gap_thresh:
|
| 719 |
+
# merged_line[-1] = [
|
| 720 |
+
# min(last[0], rect[0]),
|
| 721 |
+
# min(last[1], rect[1]),
|
| 722 |
+
# max(last[2], rect[2]),
|
| 723 |
+
# max(last[3], rect[3])
|
| 724 |
+
# ]
|
| 725 |
+
# else:
|
| 726 |
+
# merged_line.append(rect)
|
| 727 |
+
|
| 728 |
+
# # Final merge: combine all boxes in line into one
|
| 729 |
+
# x1 = min(r[0] for r in merged_line)
|
| 730 |
+
# y1 = min(r[1] for r in merged_line)
|
| 731 |
+
# x2 = max(r[2] for r in merged_line)
|
| 732 |
+
# y2 = max(r[3] for r in merged_line)
|
| 733 |
+
# merged.append([x1, y1, x2, y2])
|
| 734 |
+
|
| 735 |
+
# # Sort by Y
|
| 736 |
+
# merged.sort(key=lambda r: r[1])
|
| 737 |
+
# return merged
|
| 738 |
+
|
| 739 |
+
# def process_image(image):
|
| 740 |
+
# if image is None:
|
| 741 |
+
# return None, [], "Please upload an image."
|
| 742 |
+
|
| 743 |
+
# image_np = np.array(image.convert("RGB"))
|
| 744 |
+
|
| 745 |
+
# try:
|
| 746 |
+
# dt_boxes, _ = detector.text_detector(image_np)
|
| 747 |
+
# except Exception as e:
|
| 748 |
+
# return image, [], f"Detection Error: {str(e)}"
|
| 749 |
+
|
| 750 |
+
# if dt_boxes is None or len(dt_boxes) == 0:
|
| 751 |
+
# return image, [], "No text detected."
|
| 752 |
+
|
| 753 |
+
# line_boxes = merge_boxes_into_lines(dt_boxes)
|
| 754 |
+
|
| 755 |
+
# annotated_img = image_np.copy()
|
| 756 |
+
# results = []
|
| 757 |
+
# debug_crops = []
|
| 758 |
+
|
| 759 |
+
# for box in line_boxes:
|
| 760 |
+
# x1, y1, x2, y2 = map(int, box)
|
| 761 |
+
|
| 762 |
+
# if (x2 - x1) < 20 or (y2 - y1) < 15:
|
| 763 |
+
# continue
|
| 764 |
+
|
| 765 |
+
# cv2.rectangle(annotated_img, (x1, y1), (x2, y2), (0, 255, 0), 2)
|
| 766 |
+
|
| 767 |
+
# PAD = 10
|
| 768 |
+
# h, w, _ = image_np.shape
|
| 769 |
+
# x1 = max(0, x1 - PAD)
|
| 770 |
+
# y1 = max(0, y1 - PAD)
|
| 771 |
+
# x2 = min(w, x2 + PAD)
|
| 772 |
+
# y2 = min(h, y2 + PAD)
|
| 773 |
+
|
| 774 |
+
# crop = image_np[y1:y2, x1:x2]
|
| 775 |
+
# pil_crop = Image.fromarray(crop)
|
| 776 |
+
# debug_crops.append(pil_crop)
|
| 777 |
+
|
| 778 |
+
# with torch.no_grad():
|
| 779 |
+
# pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
|
| 780 |
+
# generated_ids = model.generate(pixel_values)
|
| 781 |
+
# text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
| 782 |
+
# if text.strip():
|
| 783 |
+
# results.append(text)
|
| 784 |
+
|
| 785 |
+
# full_text = "\n".join(results)
|
| 786 |
+
# return Image.fromarray(annotated_img), debug_crops, full_text
|
| 787 |
+
|
| 788 |
+
# with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 789 |
+
# gr.Markdown("# ⚡ Smart Line-Level OCR (Fixed)")
|
| 790 |
+
|
| 791 |
+
# with gr.Row():
|
| 792 |
+
# with gr.Column(scale=1):
|
| 793 |
+
# input_img = gr.Image(type="pil", label="Upload Image")
|
| 794 |
+
# btn = gr.Button("Transcribe", variant="primary")
|
| 795 |
+
|
| 796 |
+
# with gr.Column(scale=1):
|
| 797 |
+
# output_img = gr.Image(label="Detected Lines")
|
| 798 |
+
# output_txt = gr.Textbox(label="Extracted Text", lines=15, show_copy_button=True)
|
| 799 |
+
|
| 800 |
+
# with gr.Row():
|
| 801 |
+
# gallery = gr.Gallery(label="Line Crops", columns=4, height=200)
|
| 802 |
+
|
| 803 |
+
# btn.click(process_image, input_img, [output_img, gallery, output_txt])
|
| 804 |
+
|
| 805 |
+
# if __name__ == "__main__":
|
| 806 |
+
# demo.launch()
|
| 807 |
+
|
| 808 |
+
|
| 809 |
+
|
| 810 |
+
|
| 811 |
+
|
| 812 |
+
|
| 813 |
+
|
| 814 |
+
|
| 815 |
+
|
| 816 |
+
|
| 817 |
import gradio as gr
import torch
import numpy as np
import cv2
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from paddleocr import PaddleOCR
import pandas as pd  # NOTE(review): not used anywhere in the visible code — confirm before removing

# --- 1. SETUP TR-OCR ---
# Handwritten-text recognition model; moved to GPU when CUDA is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Loading TrOCR on {device}...")
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten').to(device).eval()

# --- 2. SETUP PADDLEOCR ---
# PaddleOCR is used only for text DETECTION (finding boxes);
# recognition of each crop is done by TrOCR above.
print("Loading PaddleOCR...")
# High resolution settings to detect faint text
detector = PaddleOCR(use_angle_cls=True, lang='en', show_log=False,
                     det_limit_side_len=2500, det_db_thresh=0.1, det_db_box_thresh=0.3)
|
| 837 |
|
| 838 |
+
|
| 839 |
+
# ==========================================
|
| 840 |
+
# 🧠 LOGIC: INTERSECTION OVER UNION (IOU)
|
| 841 |
+
# ==========================================
|
| 842 |
+
def calculate_iou_containment(box1, box2):
    """
    Calculate how much of box1's area lies inside box2.

    Args:
        box1: [x1, y1, x2, y2] rectangle (the candidate, usually smaller).
        box2: [x1, y1, x2, y2] rectangle (the reference, usually larger).

    Returns:
        Containment ratio in [0.0, 1.0]: intersection_area / area(box1).
        Returns 0.0 when the boxes do not overlap, or when box1 is
        degenerate (zero area), which previously raised ZeroDivisionError.
    """
    # Intersection rectangle corners
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[2], box2[2])
    y2 = min(box1[3], box2[3])

    # No overlap at all
    if x2 < x1 or y2 < y1:
        return 0.0

    intersection = (x2 - x1) * (y2 - y1)
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])

    # Guard against degenerate boxes to avoid division by zero
    if area1 <= 0:
        return 0.0

    return intersection / area1
|
| 859 |
|
| 860 |
+
def filter_nested_boxes(boxes, containment_thresh=0.85):
    """
    Drop every box that is mostly contained inside a larger box.

    Boxes are processed largest-first, so a candidate is discarded as soon
    as more than `containment_thresh` of its area lies inside an already
    accepted (bigger) box.
    """
    if not boxes:
        return []

    # Pair each box with its area: [x1, y1, x2, y2, area]
    with_area = [list(b) + [(b[2] - b[0]) * (b[3] - b[1])] for b in boxes]

    # Largest boxes first, so containers are accepted before their contents.
    with_area.sort(key=lambda entry: entry[4], reverse=True)

    kept = []
    for candidate in with_area:
        rect = candidate[:4]
        # Discard the candidate if it sits mostly inside any kept box.
        nested = any(
            calculate_iou_containment(rect, other) > containment_thresh
            for other in kept
        )
        if not nested:
            kept.append(rect)

    return kept
|
| 893 |
+
|
| 894 |
|
| 895 |
+
# ==========================================
|
| 896 |
+
# 🧠 LOGIC: STRICT LINE MERGING
|
| 897 |
+
# ==========================================
|
| 898 |
+
def merge_boxes_into_lines(raw_boxes, log_data):
    """
    Collapse word-level detection polygons into one bounding box per line.

    Args:
        raw_boxes: iterable of 4-point polygons from the PaddleOCR detector.
        log_data: list of strings; human-readable progress messages are
            appended to it as a side effect.

    Returns:
        List of [x1, y1, x2, y2] line boxes, sorted top to bottom.
    """
    if raw_boxes is None or len(raw_boxes) == 0:
        return []

    # 1. Reduce each polygon to its axis-aligned bounding rectangle.
    rects = []
    for poly in raw_boxes:
        pts = np.array(poly).astype(np.float32)
        rects.append([
            np.min(pts[:, 0]), np.min(pts[:, 1]),
            np.max(pts[:, 0]), np.max(pts[:, 1]),
        ])

    log_data.append(f"Raw Detections: {len(rects)} boxes found.")

    # 2. Drop boxes nested inside bigger ones.
    rects = filter_nested_boxes(rects)
    log_data.append(f"After Cleaning Nested: {len(rects)} boxes remain.")

    # 3. Work top-to-bottom by vertical center.
    rects.sort(key=lambda r: (r[1] + r[3]) / 2)

    lines = []
    while rects:
        # Seed a new line with the topmost remaining box.
        seed = rects.pop(0)
        current_line = [seed]

        seed_height = seed[3] - seed[1]
        seed_center = (seed[1] + seed[3]) / 2

        # STRICT RULE: a box joins this line only when its vertical center
        # lies within half the seed box's height of the seed's center.
        # This merges horizontally while preventing vertical merging.
        tolerance = seed_height * 0.5

        leftovers = []
        for rect in rects:
            center = (rect[1] + rect[3]) / 2
            if abs(center - seed_center) < tolerance:
                current_line.append(rect)
            else:
                leftovers.append(rect)
        rects = leftovers

        # Order the words left-to-right, then fuse them into one box.
        current_line.sort(key=lambda r: r[0])
        lines.append([
            min(r[0] for r in current_line),
            min(r[1] for r in current_line),
            max(r[2] for r in current_line),
            max(r[3] for r in current_line),
        ])

    # Final top-to-bottom ordering.
    lines.sort(key=lambda r: r[1])

    log_data.append(f"Final Merged Lines: {len(lines)} lines created.")
    return lines
|
| 964 |
+
|
| 965 |
|
| 966 |
def process_image(image):
    """
    Full OCR pipeline: detect text lines, transcribe each with TrOCR.

    Args:
        image: PIL image (or None) from the Gradio input component.

    Returns:
        Tuple of (annotated PIL image, list of line-crop PIL images,
        transcribed text, newline-joined debug log).
    """
    logs = []  # human-readable debug trace returned to the UI

    if image is None:
        return None, [], "Please upload an image.", "No logs."

    image_np = np.array(image.convert("RGB"))

    # DETECT
    try:
        dt_boxes, _ = detector.text_detector(image_np)
    except Exception as e:
        return image, [], f"Detection Error: {str(e)}", "\n".join(logs)

    if dt_boxes is None or len(dt_boxes) == 0:
        return image, [], "No text detected.", "\n".join(logs)

    # PROCESS: group word boxes into line boxes (appends to logs)
    line_boxes = merge_boxes_into_lines(dt_boxes, logs)

    annotated_img = image_np.copy()
    results = []
    debug_crops = []

    # Log the final box coordinates for inspection
    logs.append("\n--- Final Box Coordinates ---")

    for idx, line_box in enumerate(line_boxes, start=1):
        x1, y1, x2, y2 = map(int, line_box)

        logs.append(f"Line {idx}: x={x1}, y={y1}, w={x2-x1}, h={y2-y1}")

        # Filter Noise: ignore boxes too small to hold readable text
        if (x2 - x1) < 20 or (y2 - y1) < 15:
            logs.append(f"-> Skipped Line {idx} (Too Small/Noise)")
            continue

        # Draw the accepted line box on the preview (green)
        cv2.rectangle(annotated_img, (x1, y1), (x2, y2), (0, 255, 0), 2)

        # Pad the crop (clamped to image bounds) so TrOCR sees some context
        PAD = 10
        h, w, _ = image_np.shape
        x1, y1 = max(0, x1 - PAD), max(0, y1 - PAD)
        x2, y2 = min(w, x2 + PAD), min(h, y2 + PAD)

        crop = image_np[y1:y2, x1:x2]
        pil_crop = Image.fromarray(crop)
        debug_crops.append(pil_crop)

        # RECOGNIZE the cropped line with TrOCR
        with torch.no_grad():
            pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
            generated_ids = model.generate(pixel_values)
            text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        if text.strip():
            results.append(text)

    full_text = "\n".join(results)
    return Image.fromarray(annotated_img), debug_crops, full_text, "\n".join(logs)
|
| 1028 |
|
| 1029 |
+
# --- UI ---
# Gradio front-end: one image input, tabbed outputs (preview / text / logs),
# plus a gallery showing the individual line crops fed to TrOCR.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# ⚡ Smart Line-Level OCR (Debug Mode)")

    with gr.Row():
        with gr.Column(scale=1):
            # Left column: upload + trigger button
            input_img = gr.Image(type="pil", label="Upload Image")
            btn = gr.Button("Transcribe", variant="primary")

        with gr.Column(scale=1):
            # Right column: tabbed results
            with gr.Tabs():
                with gr.Tab("Visualization"):
                    output_img = gr.Image(label="Detected Lines")
                with gr.Tab("Extracted Text"):
                    output_txt = gr.Textbox(label="Result", lines=15, show_copy_button=True)
                with gr.Tab("Debug Logs"):
                    log_output = gr.Code(label="Processing Logs", language="text")

    with gr.Row():
        gallery = gr.Gallery(label="Final Line Crops", columns=4, height=200)

    # process_image returns (image, crops, text, log) matching these outputs
    btn.click(process_image, input_img, [output_img, gallery, output_txt, log_output])

if __name__ == "__main__":
    demo.launch()
|