iammraat commited on
Commit
5bd1cd1
·
verified ·
1 Parent(s): fcf7044

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +359 -112
app.py CHANGED
@@ -590,6 +590,230 @@
590
 
591
 
592
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
593
  import gradio as gr
594
  import torch
595
  import numpy as np
@@ -597,19 +821,29 @@ import cv2
597
  from PIL import Image
598
  from transformers import TrOCRProcessor, VisionEncoderDecoderModel
599
  from paddleocr import PaddleOCR
 
600
 
601
- # Setup
602
  device = "cuda" if torch.cuda.is_available() else "cpu"
603
  print(f"Loading TrOCR on {device}...")
604
  processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
605
  model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten').to(device).eval()
606
 
 
607
  print("Loading PaddleOCR...")
 
608
  detector = PaddleOCR(use_angle_cls=True, lang='en', show_log=False,
609
  det_limit_side_len=2500, det_db_thresh=0.1, det_db_box_thresh=0.3)
610
 
611
- def calculate_iou(box1, box2):
612
- """Calculate Intersection over Union"""
 
 
 
 
 
 
 
613
  x1 = max(box1[0], box2[0])
614
  y1 = max(box1[1], box2[1])
615
  x2 = min(box1[2], box2[2])
@@ -620,150 +854,156 @@ def calculate_iou(box1, box2):
620
 
621
  intersection = (x2 - x1) * (y2 - y1)
622
  area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
623
- area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
624
 
625
- return intersection / min(area1, area2)
626
 
627
- def remove_nested_boxes(boxes, iou_thresh=0.7):
628
- """Remove boxes that are nested inside others"""
629
- if len(boxes) == 0:
630
- return []
 
631
 
632
- # Add area to each box
633
- boxes_with_area = []
634
  for b in boxes:
635
  area = (b[2] - b[0]) * (b[3] - b[1])
636
- boxes_with_area.append((*b, area))
637
 
638
- # Sort by area descending (keep larger boxes)
639
- boxes_with_area.sort(key=lambda x: x[4], reverse=True)
640
 
641
- keep = []
642
- for i, current in enumerate(boxes_with_area):
643
- should_keep = True
 
644
  curr_box = current[:4]
645
 
646
- for kept in keep:
647
- iou = calculate_iou(curr_box, kept)
648
- if iou > iou_thresh:
649
- should_keep = False
 
 
650
  break
651
 
652
- if should_keep:
653
- keep.append(curr_box)
654
-
655
- return keep
 
656
 
657
- def merge_boxes_into_lines(raw_boxes, y_overlap_thresh=0.5, x_gap_thresh=100):
658
- """Merge boxes into lines with better horizontal merging"""
 
 
 
 
 
659
  if raw_boxes is None or len(raw_boxes) == 0:
660
  return []
661
-
662
- # Convert polygons to rectangles
663
  rects = []
664
  for box in raw_boxes:
665
  box = np.array(box).astype(np.float32)
666
  x1, y1 = np.min(box[:, 0]), np.min(box[:, 1])
667
  x2, y2 = np.max(box[:, 0]), np.max(box[:, 1])
668
  rects.append([x1, y1, x2, y2])
669
-
670
- # Remove nested boxes
671
- rects = remove_nested_boxes(rects)
672
-
673
- if len(rects) == 0:
674
- return []
675
-
676
- # Sort by Y position
677
- rects.sort(key=lambda r: r[1])
678
-
679
- # Group into lines based on Y overlap
680
  lines = []
681
- current_line = [rects[0]]
682
-
683
- for rect in rects[1:]:
684
- # Check if rect belongs to current line
685
- line_y1 = min(r[1] for r in current_line)
686
- line_y2 = max(r[3] for r in current_line)
687
- line_height = line_y2 - line_y1
688
-
689
- rect_y1, rect_y2 = rect[1], rect[3]
690
- rect_height = rect_y2 - rect_y1
691
-
692
- # Calculate vertical overlap
693
- overlap_y1 = max(line_y1, rect_y1)
694
- overlap_y2 = min(line_y2, rect_y2)
695
- overlap = max(0, overlap_y2 - overlap_y1)
696
-
697
- # If significant vertical overlap, it's the same line
698
- if overlap > y_overlap_thresh * min(line_height, rect_height):
699
- current_line.append(rect)
700
- else:
701
- # Save current line and start new one
702
- lines.append(current_line)
703
- current_line = [rect]
704
-
705
- lines.append(current_line)
706
-
707
- # Merge boxes in each line
708
- merged = []
709
- for line in lines:
710
- # Sort line boxes left to right
711
- line.sort(key=lambda r: r[0])
712
-
713
- # Merge horizontally close boxes
714
- merged_line = [line[0]]
715
- for rect in line[1:]:
716
- last = merged_line[-1]
717
- # If close horizontally, merge
718
- if rect[0] - last[2] < x_gap_thresh:
719
- merged_line[-1] = [
720
- min(last[0], rect[0]),
721
- min(last[1], rect[1]),
722
- max(last[2], rect[2]),
723
- max(last[3], rect[3])
724
- ]
725
  else:
726
- merged_line.append(rect)
 
 
727
 
728
- # Final merge: combine all boxes in line into one
729
- x1 = min(r[0] for r in merged_line)
730
- y1 = min(r[1] for r in merged_line)
731
- x2 = max(r[2] for r in merged_line)
732
- y2 = max(r[3] for r in merged_line)
733
- merged.append([x1, y1, x2, y2])
 
 
 
 
 
 
 
734
 
735
- # Sort by Y
736
- merged.sort(key=lambda r: r[1])
737
- return merged
738
 
739
  def process_image(image):
740
- if image is None:
741
- return None, [], "Please upload an image."
742
 
743
- image_np = np.array(image.convert("RGB"))
 
744
 
 
 
 
745
  try:
746
  dt_boxes, _ = detector.text_detector(image_np)
747
  except Exception as e:
748
- return image, [], f"Detection Error: {str(e)}"
749
-
750
  if dt_boxes is None or len(dt_boxes) == 0:
751
- return image, [], "No text detected."
752
 
753
- line_boxes = merge_boxes_into_lines(dt_boxes)
 
754
 
755
  annotated_img = image_np.copy()
756
  results = []
757
  debug_crops = []
758
 
759
- for box in line_boxes:
 
 
 
760
  x1, y1, x2, y2 = map(int, box)
761
 
 
 
 
762
  if (x2 - x1) < 20 or (y2 - y1) < 15:
 
763
  continue
764
-
 
765
  cv2.rectangle(annotated_img, (x1, y1), (x2, y2), (0, 255, 0), 2)
766
 
 
767
  PAD = 10
768
  h, w, _ = image_np.shape
769
  x1 = max(0, x1 - PAD)
@@ -775,18 +1015,20 @@ def process_image(image):
775
  pil_crop = Image.fromarray(crop)
776
  debug_crops.append(pil_crop)
777
 
 
778
  with torch.no_grad():
779
  pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
780
  generated_ids = model.generate(pixel_values)
781
  text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
782
  if text.strip():
783
  results.append(text)
784
-
785
  full_text = "\n".join(results)
786
- return Image.fromarray(annotated_img), debug_crops, full_text
787
 
 
788
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
789
- gr.Markdown("# ⚡ Smart Line-Level OCR (Fixed)")
790
 
791
  with gr.Row():
792
  with gr.Column(scale=1):
@@ -794,13 +1036,18 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
794
  btn = gr.Button("Transcribe", variant="primary")
795
 
796
  with gr.Column(scale=1):
797
- output_img = gr.Image(label="Detected Lines")
798
- output_txt = gr.Textbox(label="Extracted Text", lines=15, show_copy_button=True)
799
-
 
 
 
 
 
800
  with gr.Row():
801
- gallery = gr.Gallery(label="Line Crops", columns=4, height=200)
802
-
803
- btn.click(process_image, input_img, [output_img, gallery, output_txt])
804
 
805
  if __name__ == "__main__":
806
  demo.launch()
 
590
 
591
 
592
 
593
+ # import gradio as gr
594
+ # import torch
595
+ # import numpy as np
596
+ # import cv2
597
+ # from PIL import Image
598
+ # from transformers import TrOCRProcessor, VisionEncoderDecoderModel
599
+ # from paddleocr import PaddleOCR
600
+
601
+ # # Setup
602
+ # device = "cuda" if torch.cuda.is_available() else "cpu"
603
+ # print(f"Loading TrOCR on {device}...")
604
+ # processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
605
+ # model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten').to(device).eval()
606
+
607
+ # print("Loading PaddleOCR...")
608
+ # detector = PaddleOCR(use_angle_cls=True, lang='en', show_log=False,
609
+ # det_limit_side_len=2500, det_db_thresh=0.1, det_db_box_thresh=0.3)
610
+
611
+ # def calculate_iou(box1, box2):
612
+ # """Calculate Intersection over Union"""
613
+ # x1 = max(box1[0], box2[0])
614
+ # y1 = max(box1[1], box2[1])
615
+ # x2 = min(box1[2], box2[2])
616
+ # y2 = min(box1[3], box2[3])
617
+
618
+ # if x2 < x1 or y2 < y1:
619
+ # return 0.0
620
+
621
+ # intersection = (x2 - x1) * (y2 - y1)
622
+ # area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
623
+ # area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
624
+
625
+ # return intersection / min(area1, area2)
626
+
627
+ # def remove_nested_boxes(boxes, iou_thresh=0.7):
628
+ # """Remove boxes that are nested inside others"""
629
+ # if len(boxes) == 0:
630
+ # return []
631
+
632
+ # # Add area to each box
633
+ # boxes_with_area = []
634
+ # for b in boxes:
635
+ # area = (b[2] - b[0]) * (b[3] - b[1])
636
+ # boxes_with_area.append((*b, area))
637
+
638
+ # # Sort by area descending (keep larger boxes)
639
+ # boxes_with_area.sort(key=lambda x: x[4], reverse=True)
640
+
641
+ # keep = []
642
+ # for i, current in enumerate(boxes_with_area):
643
+ # should_keep = True
644
+ # curr_box = current[:4]
645
+
646
+ # for kept in keep:
647
+ # iou = calculate_iou(curr_box, kept)
648
+ # if iou > iou_thresh:
649
+ # should_keep = False
650
+ # break
651
+
652
+ # if should_keep:
653
+ # keep.append(curr_box)
654
+
655
+ # return keep
656
+
657
+ # def merge_boxes_into_lines(raw_boxes, y_overlap_thresh=0.5, x_gap_thresh=100):
658
+ # """Merge boxes into lines with better horizontal merging"""
659
+ # if raw_boxes is None or len(raw_boxes) == 0:
660
+ # return []
661
+
662
+ # # Convert polygons to rectangles
663
+ # rects = []
664
+ # for box in raw_boxes:
665
+ # box = np.array(box).astype(np.float32)
666
+ # x1, y1 = np.min(box[:, 0]), np.min(box[:, 1])
667
+ # x2, y2 = np.max(box[:, 0]), np.max(box[:, 1])
668
+ # rects.append([x1, y1, x2, y2])
669
+
670
+ # # Remove nested boxes
671
+ # rects = remove_nested_boxes(rects)
672
+
673
+ # if len(rects) == 0:
674
+ # return []
675
+
676
+ # # Sort by Y position
677
+ # rects.sort(key=lambda r: r[1])
678
+
679
+ # # Group into lines based on Y overlap
680
+ # lines = []
681
+ # current_line = [rects[0]]
682
+
683
+ # for rect in rects[1:]:
684
+ # # Check if rect belongs to current line
685
+ # line_y1 = min(r[1] for r in current_line)
686
+ # line_y2 = max(r[3] for r in current_line)
687
+ # line_height = line_y2 - line_y1
688
+
689
+ # rect_y1, rect_y2 = rect[1], rect[3]
690
+ # rect_height = rect_y2 - rect_y1
691
+
692
+ # # Calculate vertical overlap
693
+ # overlap_y1 = max(line_y1, rect_y1)
694
+ # overlap_y2 = min(line_y2, rect_y2)
695
+ # overlap = max(0, overlap_y2 - overlap_y1)
696
+
697
+ # # If significant vertical overlap, it's the same line
698
+ # if overlap > y_overlap_thresh * min(line_height, rect_height):
699
+ # current_line.append(rect)
700
+ # else:
701
+ # # Save current line and start new one
702
+ # lines.append(current_line)
703
+ # current_line = [rect]
704
+
705
+ # lines.append(current_line)
706
+
707
+ # # Merge boxes in each line
708
+ # merged = []
709
+ # for line in lines:
710
+ # # Sort line boxes left to right
711
+ # line.sort(key=lambda r: r[0])
712
+
713
+ # # Merge horizontally close boxes
714
+ # merged_line = [line[0]]
715
+ # for rect in line[1:]:
716
+ # last = merged_line[-1]
717
+ # # If close horizontally, merge
718
+ # if rect[0] - last[2] < x_gap_thresh:
719
+ # merged_line[-1] = [
720
+ # min(last[0], rect[0]),
721
+ # min(last[1], rect[1]),
722
+ # max(last[2], rect[2]),
723
+ # max(last[3], rect[3])
724
+ # ]
725
+ # else:
726
+ # merged_line.append(rect)
727
+
728
+ # # Final merge: combine all boxes in line into one
729
+ # x1 = min(r[0] for r in merged_line)
730
+ # y1 = min(r[1] for r in merged_line)
731
+ # x2 = max(r[2] for r in merged_line)
732
+ # y2 = max(r[3] for r in merged_line)
733
+ # merged.append([x1, y1, x2, y2])
734
+
735
+ # # Sort by Y
736
+ # merged.sort(key=lambda r: r[1])
737
+ # return merged
738
+
739
+ # def process_image(image):
740
+ # if image is None:
741
+ # return None, [], "Please upload an image."
742
+
743
+ # image_np = np.array(image.convert("RGB"))
744
+
745
+ # try:
746
+ # dt_boxes, _ = detector.text_detector(image_np)
747
+ # except Exception as e:
748
+ # return image, [], f"Detection Error: {str(e)}"
749
+
750
+ # if dt_boxes is None or len(dt_boxes) == 0:
751
+ # return image, [], "No text detected."
752
+
753
+ # line_boxes = merge_boxes_into_lines(dt_boxes)
754
+
755
+ # annotated_img = image_np.copy()
756
+ # results = []
757
+ # debug_crops = []
758
+
759
+ # for box in line_boxes:
760
+ # x1, y1, x2, y2 = map(int, box)
761
+
762
+ # if (x2 - x1) < 20 or (y2 - y1) < 15:
763
+ # continue
764
+
765
+ # cv2.rectangle(annotated_img, (x1, y1), (x2, y2), (0, 255, 0), 2)
766
+
767
+ # PAD = 10
768
+ # h, w, _ = image_np.shape
769
+ # x1 = max(0, x1 - PAD)
770
+ # y1 = max(0, y1 - PAD)
771
+ # x2 = min(w, x2 + PAD)
772
+ # y2 = min(h, y2 + PAD)
773
+
774
+ # crop = image_np[y1:y2, x1:x2]
775
+ # pil_crop = Image.fromarray(crop)
776
+ # debug_crops.append(pil_crop)
777
+
778
+ # with torch.no_grad():
779
+ # pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
780
+ # generated_ids = model.generate(pixel_values)
781
+ # text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
782
+ # if text.strip():
783
+ # results.append(text)
784
+
785
+ # full_text = "\n".join(results)
786
+ # return Image.fromarray(annotated_img), debug_crops, full_text
787
+
788
+ # with gr.Blocks(theme=gr.themes.Soft()) as demo:
789
+ # gr.Markdown("# ⚡ Smart Line-Level OCR (Fixed)")
790
+
791
+ # with gr.Row():
792
+ # with gr.Column(scale=1):
793
+ # input_img = gr.Image(type="pil", label="Upload Image")
794
+ # btn = gr.Button("Transcribe", variant="primary")
795
+
796
+ # with gr.Column(scale=1):
797
+ # output_img = gr.Image(label="Detected Lines")
798
+ # output_txt = gr.Textbox(label="Extracted Text", lines=15, show_copy_button=True)
799
+
800
+ # with gr.Row():
801
+ # gallery = gr.Gallery(label="Line Crops", columns=4, height=200)
802
+
803
+ # btn.click(process_image, input_img, [output_img, gallery, output_txt])
804
+
805
+ # if __name__ == "__main__":
806
+ # demo.launch()
807
+
808
+
809
+
810
+
811
+
812
+
813
+
814
+
815
+
816
+
817
  import gradio as gr
818
  import torch
819
  import numpy as np
 
821
  from PIL import Image
822
  from transformers import TrOCRProcessor, VisionEncoderDecoderModel
823
  from paddleocr import PaddleOCR
824
+ import pandas as pd
825
 
826
+ # --- 1. SETUP TR-OCR ---
827
  device = "cuda" if torch.cuda.is_available() else "cpu"
828
  print(f"Loading TrOCR on {device}...")
829
  processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
830
  model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten').to(device).eval()
831
 
832
+ # --- 2. SETUP PADDLEOCR ---
833
  print("Loading PaddleOCR...")
834
+ # High resolution settings to detect faint text
835
  detector = PaddleOCR(use_angle_cls=True, lang='en', show_log=False,
836
  det_limit_side_len=2500, det_db_thresh=0.1, det_db_box_thresh=0.3)
837
 
838
+
839
# ==========================================
# 🧠 LOGIC: INTERSECTION OVER UNION (IOU)
# ==========================================
def calculate_iou_containment(box1, box2):
    """
    Calculate how much of box1 lies inside box2.

    Args:
        box1: [x1, y1, x2, y2] rectangle (the candidate "nested" box).
        box2: [x1, y1, x2, y2] rectangle (the potential container).

    Returns:
        float: intersection area divided by box1's own area (0.0 to 1.0).
        Returns 0.0 when the boxes do not overlap, and also when box1 is
        degenerate (zero area) — previously that raised ZeroDivisionError.
    """
    # Intersection rectangle corners.
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[2], box2[2])
    y2 = min(box1[3], box2[3])

    # No overlap at all.
    if x2 < x1 or y2 < y1:
        return 0.0

    intersection = (x2 - x1) * (y2 - y1)
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])

    # Guard: a zero-area box1 cannot meaningfully be "contained".
    if area1 <= 0:
        return 0.0

    return intersection / area1
859
 
860
def filter_nested_boxes(boxes, containment_thresh=0.85):
    """
    Drop boxes that are mostly contained inside larger, already-kept boxes.

    Args:
        boxes: iterable of [x1, y1, x2, y2] rectangles.
        containment_thresh: fraction of a box's own area that must lie inside
            a kept box for the box to be discarded.

    Returns:
        list of [x1, y1, x2, y2] boxes, ordered largest-area first.
    """
    if not boxes:
        return []

    # Pair every rectangle with its area ([x1, y1, x2, y2, area]) and visit
    # biggest first, so each smaller box is tested against the larger
    # survivors that could swallow it.
    by_size = sorted(
        (list(b) + [(b[2] - b[0]) * (b[3] - b[1])] for b in boxes),
        key=lambda item: item[4],
        reverse=True,
    )

    survivors = []
    for candidate in by_size:
        rect = candidate[:4]

        # Nested if mostly inside any bigger box we already kept.
        swallowed = any(
            calculate_iou_containment(rect, keeper) > containment_thresh
            for keeper in survivors
        )
        if not swallowed:
            survivors.append(rect)

    return survivors
893
+
894
 
895
# ==========================================
# 🧠 LOGIC: STRICT LINE MERGING
# ==========================================
def merge_boxes_into_lines(raw_boxes, log_data):
    """
    Group detector boxes into text lines: merge horizontally, never vertically.

    Args:
        raw_boxes: iterable of 4-point polygons from the detector, or None.
        log_data: list of strings; progress messages are appended in place.

    Returns:
        list of merged [x1, y1, x2, y2] line boxes, sorted top-to-bottom.
    """
    if raw_boxes is None or len(raw_boxes) == 0:
        return []

    # 1. Collapse each detected polygon to its axis-aligned bounding rect.
    rects = []
    for poly in raw_boxes:
        pts = np.array(poly).astype(np.float32)
        rects.append([np.min(pts[:, 0]), np.min(pts[:, 1]),
                      np.max(pts[:, 0]), np.max(pts[:, 1])])

    log_data.append(f"Raw Detections: {len(rects)} boxes found.")

    # 2. Remove boxes swallowed by bigger ones.
    rects = filter_nested_boxes(rects)
    log_data.append(f"After Cleaning Nested: {len(rects)} boxes remain.")

    # 3. Order top-to-bottom by vertical center.
    rects.sort(key=lambda r: (r[1] + r[3]) / 2)

    lines = []
    while rects:
        # Seed a new line with the topmost remaining box. The seed's height
        # sets a STRICT tolerance: a box joins this line only if its Y-center
        # is within half the seed's height — prevents vertical over-merging.
        seed = rects.pop(0)
        group = [seed]
        anchor_center = (seed[1] + seed[3]) / 2
        tolerance = (seed[3] - seed[1]) * 0.5

        leftovers = []
        for rect in rects:
            center = (rect[1] + rect[3]) / 2
            if abs(center - anchor_center) < tolerance:
                group.append(rect)
            else:
                leftovers.append(rect)
        rects = leftovers

        # Order the words left-to-right, then fuse the group into ONE box.
        group.sort(key=lambda r: r[0])
        lines.append([min(r[0] for r in group),
                      min(r[1] for r in group),
                      max(r[2] for r in group),
                      max(r[3] for r in group)])

    # Final sort by top edge.
    lines.sort(key=lambda r: r[1])

    log_data.append(f"Final Merged Lines: {len(lines)} lines created.")
    return lines
964
+
965
 
966
def process_image(image):
    """
    Full OCR pipeline: detect text regions, merge them into lines, crop each
    line, and run TrOCR recognition on every crop.

    Relies on the module-level `detector` (PaddleOCR), `processor`/`model`
    (TrOCR) and `device`.

    Args:
        image: PIL image to transcribe, or None (Gradio passes None when
            nothing was uploaded).

    Returns:
        4-tuple for the Gradio outputs:
        (annotated image or passthrough, list of PIL line crops,
         extracted text or status message, newline-joined debug log).
    """
    logs = [] # Store debug messages here

    if image is None:
        return None, [], "Please upload an image.", "No logs."

    image_np = np.array(image.convert("RGB"))

    # DETECT
    try:
        dt_boxes, _ = detector.text_detector(image_np)
    except Exception as e:
        # Surface detector failures to the UI instead of crashing the app.
        return image, [], f"Detection Error: {str(e)}", "\n".join(logs)

    if dt_boxes is None or len(dt_boxes) == 0:
        return image, [], "No text detected.", "\n".join(logs)

    # PROCESS
    line_boxes = merge_boxes_into_lines(dt_boxes, logs)

    annotated_img = image_np.copy()
    results = []
    debug_crops = []

    # Log the final box coordinates for inspection
    logs.append("\n--- Final Box Coordinates ---")

    for i, box in enumerate(line_boxes):
        x1, y1, x2, y2 = map(int, box)

        logs.append(f"Line {i+1}: x={x1}, y={y1}, w={x2-x1}, h={y2-y1}")

        # Filter Noise
        # NOTE(review): thresholds (20px wide, 15px tall) assume roughly
        # screen-resolution inputs — confirm against typical upload sizes.
        if (x2 - x1) < 20 or (y2 - y1) < 15:
            logs.append(f"-> Skipped Line {i+1} (Too Small/Noise)")
            continue

        # Draw (Green)
        cv2.rectangle(annotated_img, (x1, y1), (x2, y2), (0, 255, 0), 2)

        # PADDING
        # Expand the crop slightly, clamped to the image bounds, so TrOCR
        # sees full ascenders/descenders.
        PAD = 10
        h, w, _ = image_np.shape
        x1 = max(0, x1 - PAD)
        y1 = max(0, y1 - PAD)
        x2 = min(w, x2 + PAD)
        y2 = min(h, y2 + PAD)

        crop = image_np[y1:y2, x1:x2]
        pil_crop = Image.fromarray(crop)
        debug_crops.append(pil_crop)

        # RECOGNIZE
        # One TrOCR generate() call per line crop; empty decodes are dropped.
        with torch.no_grad():
            pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
            generated_ids = model.generate(pixel_values)
            text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
            if text.strip():
                results.append(text)

    full_text = "\n".join(results)
    return Image.fromarray(annotated_img), debug_crops, full_text, "\n".join(logs)
1028
 
1029
+ # --- UI ---
1030
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
1031
+ gr.Markdown("# ⚡ Smart Line-Level OCR (Debug Mode)")
1032
 
1033
  with gr.Row():
1034
  with gr.Column(scale=1):
 
1036
  btn = gr.Button("Transcribe", variant="primary")
1037
 
1038
  with gr.Column(scale=1):
1039
+ with gr.Tabs():
1040
+ with gr.Tab("Visualization"):
1041
+ output_img = gr.Image(label="Detected Lines")
1042
+ with gr.Tab("Extracted Text"):
1043
+ output_txt = gr.Textbox(label="Result", lines=15, show_copy_button=True)
1044
+ with gr.Tab("Debug Logs"):
1045
+ log_output = gr.Code(label="Processing Logs", language="text")
1046
+
1047
  with gr.Row():
1048
+ gallery = gr.Gallery(label="Final Line Crops", columns=4, height=200)
1049
+
1050
+ btn.click(process_image, input_img, [output_img, gallery, output_txt, log_output])
1051
 
1052
  if __name__ == "__main__":
1053
  demo.launch()