iammraat committed on
Commit
a0c16c2
Β·
verified Β·
1 Parent(s): 5f2a6f5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +601 -151
app.py CHANGED
@@ -816,6 +816,257 @@
816
 
817
 
818
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
819
  import gradio as gr
820
  import torch
821
  import numpy as np
@@ -823,64 +1074,187 @@ import cv2
823
  from PIL import Image
824
  from transformers import TrOCRProcessor, VisionEncoderDecoderModel
825
  from paddleocr import PaddleOCR
826
- import pandas as pd
 
 
 
827
 
828
- # --- 1. SETUP TR-OCR ---
 
 
829
  device = "cuda" if torch.cuda.is_available() else "cpu"
830
  print(f"Loading TrOCR on {device}...")
831
- processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
832
- model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten').to(device).eval()
833
 
834
- # --- 2. SETUP PADDLEOCR ---
 
 
 
835
  print("Loading PaddleOCR...")
836
- # High resolution settings to detect faint text
837
- detector = PaddleOCR(use_angle_cls=True, lang='en', show_log=False,
838
- det_limit_side_len=2500, det_db_thresh=0.1, det_db_box_thresh=0.3)
 
 
 
 
 
 
 
839
 
840
 
841
  # ==========================================
842
- # 🧠 LOGIC: INTERSECTION OVER UNION (IOU)
843
  # ==========================================
844
- def calculate_iou_containment(box1, box2):
845
  """
846
- Calculates how much of box1 is inside box2.
847
- Returns: ratio (0.0 to 1.0)
848
  """
849
- x1 = max(box1[0], box2[0])
850
- y1 = max(box1[1], box2[1])
851
- x2 = min(box1[2], box2[2])
852
- y2 = min(box1[3], box2[3])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
853
 
854
- if x2 < x1 or y2 < y1:
855
- return 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
856
 
857
- intersection = (x2 - x1) * (y2 - y1)
858
- area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
859
 
860
- return intersection / area1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
861
 
862
  def filter_nested_boxes(boxes, containment_thresh=0.85):
863
  """
864
  Removes boxes that are mostly contained within other larger boxes.
 
865
  """
866
- if not boxes: return []
 
867
 
868
- # [x1, y1, x2, y2, area]
869
- active = []
870
  for b in boxes:
871
  area = (b[2] - b[0]) * (b[3] - b[1])
872
- active.append(list(b) + [area])
873
 
874
- # Sort by Area descending (Biggest first)
875
- active.sort(key=lambda x: x[4], reverse=True)
876
 
877
  final_boxes = []
878
 
879
- for current in active:
880
  is_nested = False
881
  curr_box = current[:4]
882
 
883
- # Check if this box is inside any bigger box we already kept
884
  for kept in final_boxes:
885
  overlap_ratio = calculate_iou_containment(curr_box, kept)
886
 
@@ -894,173 +1268,249 @@ def filter_nested_boxes(boxes, containment_thresh=0.85):
894
  return final_boxes
895
 
896
 
897
- # ==========================================
898
- # 🧠 LOGIC: STRICT LINE MERGING
899
- # ==========================================
900
- def merge_boxes_into_lines(raw_boxes, log_data):
901
  """
902
- Merges boxes horizontally but prevents vertical merging.
 
903
  """
904
- if raw_boxes is None or len(raw_boxes) == 0:
905
- return []
906
-
907
- # 1. Convert to Rects
908
- rects = []
909
- for box in raw_boxes:
910
- box = np.array(box).astype(np.float32)
911
- x1, y1 = np.min(box[:, 0]), np.min(box[:, 1])
912
- x2, y2 = np.max(box[:, 0]), np.max(box[:, 1])
913
- rects.append([x1, y1, x2, y2])
914
-
915
- log_data.append(f"Raw Detections: {len(rects)} boxes found.")
916
-
917
- # 2. Filter Nested
918
- rects = filter_nested_boxes(rects)
919
- log_data.append(f"After Cleaning Nested: {len(rects)} boxes remain.")
920
 
921
- # 3. Sort by Y-Center (Top to Bottom)
922
- rects.sort(key=lambda r: (r[1] + r[3]) / 2)
923
 
924
- lines = []
 
 
 
 
 
 
 
925
 
926
- while rects:
927
- # Start a new line with the highest remaining box
928
- current_line = [rects.pop(0)]
929
-
930
- # Calculate the dynamic "height" of this line based on the first word
931
- ref_h = current_line[0][3] - current_line[0][1]
932
- ref_y_center = (current_line[0][1] + current_line[0][3]) / 2
933
 
934
- # Look for other words on this SAME line
935
- # STRICT RULE: A box is on the same line ONLY if its Y-center
936
- # is within 50% of the reference box's height.
937
- vertical_tolerance = ref_h * 0.5
938
-
939
- remaining_rects = []
940
- for r in rects:
941
- r_y_center = (r[1] + r[3]) / 2
942
 
943
- if abs(r_y_center - ref_y_center) < vertical_tolerance:
944
- current_line.append(r)
945
- else:
946
- remaining_rects.append(r)
947
-
948
- rects = remaining_rects
949
-
950
- # Sort words in this line left-to-right
951
- current_line.sort(key=lambda r: r[0])
952
-
953
- # 4. Merge the horizontal group into ONE box
954
- lx1 = min(r[0] for r in current_line)
955
- ly1 = min(r[1] for r in current_line)
956
- lx2 = max(r[2] for r in current_line)
957
- ly2 = max(r[3] for r in current_line)
958
-
959
- lines.append([lx1, ly1, lx2, ly2])
960
-
961
- # Final Sort by Y
962
- lines.sort(key=lambda r: r[1])
963
 
964
- log_data.append(f"Final Merged Lines: {len(lines)} lines created.")
965
- return lines
966
 
967
 
968
- def process_image(image):
969
- logs = [] # Store debug messages here
 
 
 
 
 
 
970
 
971
  if image is None:
972
- return None, [], "Please upload an image.", "No logs."
 
 
 
 
973
 
 
974
  image_np = np.array(image.convert("RGB"))
975
-
976
- # DETECT
 
 
 
 
 
 
 
 
 
 
 
977
  try:
978
- dt_boxes, _ = detector.text_detector(image_np)
979
  except Exception as e:
980
- return image, [], f"Detection Error: {str(e)}", "\n".join(logs)
 
 
981
 
982
  if dt_boxes is None or len(dt_boxes) == 0:
983
- return image, [], "No text detected.", "\n".join(logs)
 
 
984
 
985
- # PROCESS
986
- line_boxes = merge_boxes_into_lines(dt_boxes, logs)
 
987
 
988
- annotated_img = image_np.copy()
989
- results = []
990
- debug_crops = []
 
991
 
992
- # Log the final box coordinates for inspection
993
- logs.append("\n--- Final Box Coordinates ---")
 
 
 
 
 
994
 
995
  for i, box in enumerate(line_boxes):
996
  x1, y1, x2, y2 = map(int, box)
997
 
998
- logs.append(f"Line {i+1}: x={x1}, y={y1}, w={x2-x1}, h={y2-y1}")
999
 
1000
- # Filter Noise
1001
- if (x2 - x1) < 20 or (y2 - y1) < 15:
1002
- logs.append(f"-> Skipped Line {i+1} (Too Small/Noise)")
1003
- continue
1004
-
1005
- # Draw (Green)
1006
- cv2.rectangle(annotated_img, (x1, y1), (x2, y2), (0, 255, 0), 2)
1007
 
1008
- # PADDING
1009
  PAD = 10
1010
- h, w, _ = image_np.shape
1011
- x1 = max(0, x1 - PAD)
1012
- y1 = max(0, y1 - PAD)
1013
- x2 = min(w, x2 + PAD)
1014
- y2 = min(h, y2 + PAD)
1015
 
1016
- crop = image_np[y1:y2, x1:x2]
 
1017
  pil_crop = Image.fromarray(crop)
 
1018
  debug_crops.append(pil_crop)
1019
-
1020
- # RECOGNIZE
1021
- with torch.no_grad():
1022
- pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
1023
- generated_ids = model.generate(pixel_values)
1024
- text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
1025
- if text.strip():
1026
- results.append(text)
1027
-
 
 
 
 
 
 
 
 
 
 
 
 
 
1028
  full_text = "\n".join(results)
 
 
 
 
 
1029
  return Image.fromarray(annotated_img), debug_crops, full_text, "\n".join(logs)
1030
 
1031
- # --- UI ---
1032
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
1033
- gr.Markdown("# ⚑ Smart Line-Level OCR (Debug Mode)")
 
 
 
 
 
 
 
 
 
 
 
 
1034
 
1035
  with gr.Row():
1036
  with gr.Column(scale=1):
1037
- input_img = gr.Image(type="pil", label="Upload Image")
1038
- btn = gr.Button("Transcribe", variant="primary")
 
 
 
 
 
 
 
 
1039
 
1040
  with gr.Column(scale=1):
1041
  with gr.Tabs():
1042
- with gr.Tab("Visualization"):
1043
  output_img = gr.Image(label="Detected Lines")
1044
- with gr.Tab("Extracted Text"):
1045
- output_txt = gr.Textbox(label="Result", lines=15, show_copy_button=True)
1046
- with gr.Tab("Debug Logs"):
1047
- # CHANGED HERE: Uses Textbox instead of Code to avoid version errors
1048
- log_output = gr.Textbox(label="Processing Logs", lines=20, interactive=False)
1049
-
 
 
 
 
 
 
 
 
 
 
 
1050
  with gr.Row():
1051
- gallery = gr.Gallery(label="Final Line Crops", columns=4, height=200)
1052
-
1053
- btn.click(process_image, input_img, [output_img, gallery, output_txt, log_output])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1054
 
1055
  if __name__ == "__main__":
1056
  demo.launch()
1057
 
1058
 
1059
 
1060
-
1061
-
1062
-
1063
-
1064
-
1065
-
1066
-
 
816
 
817
 
818
 
819
+ # import gradio as gr
820
+ # import torch
821
+ # import numpy as np
822
+ # import cv2
823
+ # from PIL import Image
824
+ # from transformers import TrOCRProcessor, VisionEncoderDecoderModel
825
+ # from paddleocr import PaddleOCR
826
+ # import pandas as pd
827
+
828
+ # # --- 1. SETUP TR-OCR ---
829
+ # device = "cuda" if torch.cuda.is_available() else "cpu"
830
+ # print(f"Loading TrOCR on {device}...")
831
+ # processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
832
+ # model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten').to(device).eval()
833
+
834
+ # # --- 2. SETUP PADDLEOCR ---
835
+ # print("Loading PaddleOCR...")
836
+ # # High resolution settings to detect faint text
837
+ # detector = PaddleOCR(use_angle_cls=True, lang='en', show_log=False,
838
+ # det_limit_side_len=2500, det_db_thresh=0.1, det_db_box_thresh=0.3)
839
+
840
+
841
+ # # ==========================================
842
+ # # 🧠 LOGIC: INTERSECTION OVER UNION (IOU)
843
+ # # ==========================================
844
+ # def calculate_iou_containment(box1, box2):
845
+ # """
846
+ # Calculates how much of box1 is inside box2.
847
+ # Returns: ratio (0.0 to 1.0)
848
+ # """
849
+ # x1 = max(box1[0], box2[0])
850
+ # y1 = max(box1[1], box2[1])
851
+ # x2 = min(box1[2], box2[2])
852
+ # y2 = min(box1[3], box2[3])
853
+
854
+ # if x2 < x1 or y2 < y1:
855
+ # return 0.0
856
+
857
+ # intersection = (x2 - x1) * (y2 - y1)
858
+ # area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
859
+
860
+ # return intersection / area1
861
+
862
+ # def filter_nested_boxes(boxes, containment_thresh=0.85):
863
+ # """
864
+ # Removes boxes that are mostly contained within other larger boxes.
865
+ # """
866
+ # if not boxes: return []
867
+
868
+ # # [x1, y1, x2, y2, area]
869
+ # active = []
870
+ # for b in boxes:
871
+ # area = (b[2] - b[0]) * (b[3] - b[1])
872
+ # active.append(list(b) + [area])
873
+
874
+ # # Sort by Area descending (Biggest first)
875
+ # active.sort(key=lambda x: x[4], reverse=True)
876
+
877
+ # final_boxes = []
878
+
879
+ # for current in active:
880
+ # is_nested = False
881
+ # curr_box = current[:4]
882
+
883
+ # # Check if this box is inside any bigger box we already kept
884
+ # for kept in final_boxes:
885
+ # overlap_ratio = calculate_iou_containment(curr_box, kept)
886
+
887
+ # if overlap_ratio > containment_thresh:
888
+ # is_nested = True
889
+ # break
890
+
891
+ # if not is_nested:
892
+ # final_boxes.append(curr_box)
893
+
894
+ # return final_boxes
895
+
896
+
897
+ # # ==========================================
898
+ # # 🧠 LOGIC: STRICT LINE MERGING
899
+ # # ==========================================
900
+ # def merge_boxes_into_lines(raw_boxes, log_data):
901
+ # """
902
+ # Merges boxes horizontally but prevents vertical merging.
903
+ # """
904
+ # if raw_boxes is None or len(raw_boxes) == 0:
905
+ # return []
906
+
907
+ # # 1. Convert to Rects
908
+ # rects = []
909
+ # for box in raw_boxes:
910
+ # box = np.array(box).astype(np.float32)
911
+ # x1, y1 = np.min(box[:, 0]), np.min(box[:, 1])
912
+ # x2, y2 = np.max(box[:, 0]), np.max(box[:, 1])
913
+ # rects.append([x1, y1, x2, y2])
914
+
915
+ # log_data.append(f"Raw Detections: {len(rects)} boxes found.")
916
+
917
+ # # 2. Filter Nested
918
+ # rects = filter_nested_boxes(rects)
919
+ # log_data.append(f"After Cleaning Nested: {len(rects)} boxes remain.")
920
+
921
+ # # 3. Sort by Y-Center (Top to Bottom)
922
+ # rects.sort(key=lambda r: (r[1] + r[3]) / 2)
923
+
924
+ # lines = []
925
+
926
+ # while rects:
927
+ # # Start a new line with the highest remaining box
928
+ # current_line = [rects.pop(0)]
929
+
930
+ # # Calculate the dynamic "height" of this line based on the first word
931
+ # ref_h = current_line[0][3] - current_line[0][1]
932
+ # ref_y_center = (current_line[0][1] + current_line[0][3]) / 2
933
+
934
+ # # Look for other words on this SAME line
935
+ # # STRICT RULE: A box is on the same line ONLY if its Y-center
936
+ # # is within 50% of the reference box's height.
937
+ # vertical_tolerance = ref_h * 0.5
938
+
939
+ # remaining_rects = []
940
+ # for r in rects:
941
+ # r_y_center = (r[1] + r[3]) / 2
942
+
943
+ # if abs(r_y_center - ref_y_center) < vertical_tolerance:
944
+ # current_line.append(r)
945
+ # else:
946
+ # remaining_rects.append(r)
947
+
948
+ # rects = remaining_rects
949
+
950
+ # # Sort words in this line left-to-right
951
+ # current_line.sort(key=lambda r: r[0])
952
+
953
+ # # 4. Merge the horizontal group into ONE box
954
+ # lx1 = min(r[0] for r in current_line)
955
+ # ly1 = min(r[1] for r in current_line)
956
+ # lx2 = max(r[2] for r in current_line)
957
+ # ly2 = max(r[3] for r in current_line)
958
+
959
+ # lines.append([lx1, ly1, lx2, ly2])
960
+
961
+ # # Final Sort by Y
962
+ # lines.sort(key=lambda r: r[1])
963
+
964
+ # log_data.append(f"Final Merged Lines: {len(lines)} lines created.")
965
+ # return lines
966
+
967
+
968
+ # def process_image(image):
969
+ # logs = [] # Store debug messages here
970
+
971
+ # if image is None:
972
+ # return None, [], "Please upload an image.", "No logs."
973
+
974
+ # image_np = np.array(image.convert("RGB"))
975
+
976
+ # # DETECT
977
+ # try:
978
+ # dt_boxes, _ = detector.text_detector(image_np)
979
+ # except Exception as e:
980
+ # return image, [], f"Detection Error: {str(e)}", "\n".join(logs)
981
+
982
+ # if dt_boxes is None or len(dt_boxes) == 0:
983
+ # return image, [], "No text detected.", "\n".join(logs)
984
+
985
+ # # PROCESS
986
+ # line_boxes = merge_boxes_into_lines(dt_boxes, logs)
987
+
988
+ # annotated_img = image_np.copy()
989
+ # results = []
990
+ # debug_crops = []
991
+
992
+ # # Log the final box coordinates for inspection
993
+ # logs.append("\n--- Final Box Coordinates ---")
994
+
995
+ # for i, box in enumerate(line_boxes):
996
+ # x1, y1, x2, y2 = map(int, box)
997
+
998
+ # logs.append(f"Line {i+1}: x={x1}, y={y1}, w={x2-x1}, h={y2-y1}")
999
+
1000
+ # # Filter Noise
1001
+ # if (x2 - x1) < 20 or (y2 - y1) < 15:
1002
+ # logs.append(f"-> Skipped Line {i+1} (Too Small/Noise)")
1003
+ # continue
1004
+
1005
+ # # Draw (Green)
1006
+ # cv2.rectangle(annotated_img, (x1, y1), (x2, y2), (0, 255, 0), 2)
1007
+
1008
+ # # PADDING
1009
+ # PAD = 10
1010
+ # h, w, _ = image_np.shape
1011
+ # x1 = max(0, x1 - PAD)
1012
+ # y1 = max(0, y1 - PAD)
1013
+ # x2 = min(w, x2 + PAD)
1014
+ # y2 = min(h, y2 + PAD)
1015
+
1016
+ # crop = image_np[y1:y2, x1:x2]
1017
+ # pil_crop = Image.fromarray(crop)
1018
+ # debug_crops.append(pil_crop)
1019
+
1020
+ # # RECOGNIZE
1021
+ # with torch.no_grad():
1022
+ # pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
1023
+ # generated_ids = model.generate(pixel_values)
1024
+ # text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
1025
+ # if text.strip():
1026
+ # results.append(text)
1027
+
1028
+ # full_text = "\n".join(results)
1029
+ # return Image.fromarray(annotated_img), debug_crops, full_text, "\n".join(logs)
1030
+
1031
+ # # --- UI ---
1032
+ # with gr.Blocks(theme=gr.themes.Soft()) as demo:
1033
+ # gr.Markdown("# ⚑ Smart Line-Level OCR (Debug Mode)")
1034
+
1035
+ # with gr.Row():
1036
+ # with gr.Column(scale=1):
1037
+ # input_img = gr.Image(type="pil", label="Upload Image")
1038
+ # btn = gr.Button("Transcribe", variant="primary")
1039
+
1040
+ # with gr.Column(scale=1):
1041
+ # with gr.Tabs():
1042
+ # with gr.Tab("Visualization"):
1043
+ # output_img = gr.Image(label="Detected Lines")
1044
+ # with gr.Tab("Extracted Text"):
1045
+ # output_txt = gr.Textbox(label="Result", lines=15, show_copy_button=True)
1046
+ # with gr.Tab("Debug Logs"):
1047
+ # # CHANGED HERE: Uses Textbox instead of Code to avoid version errors
1048
+ # log_output = gr.Textbox(label="Processing Logs", lines=20, interactive=False)
1049
+
1050
+ # with gr.Row():
1051
+ # gallery = gr.Gallery(label="Final Line Crops", columns=4, height=200)
1052
+
1053
+ # btn.click(process_image, input_img, [output_img, gallery, output_txt, log_output])
1054
+
1055
+ # if __name__ == "__main__":
1056
+ # demo.launch()
1057
+
1058
+
1059
+
1060
+
1061
+
1062
+
1063
+
1064
+
1065
+
1066
+
1067
+
1068
+
1069
+
1070
  import gradio as gr
1071
  import torch
1072
  import numpy as np
 
1074
  from PIL import Image
1075
  from transformers import TrOCRProcessor, VisionEncoderDecoderModel
1076
  from paddleocr import PaddleOCR
1077
+ from sklearn.cluster import DBSCAN
1078
+ from scipy.spatial.distance import pdist, squareform
1079
+ import warnings
1080
+ warnings.filterwarnings('ignore')
1081
 
1082
# ==========================================
# πŸš€ SETUP MODELS
# ==========================================
# Module-level setup: both models are loaded once at import time, before the
# Gradio UI is built, so the first request doesn't pay the load cost.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Loading TrOCR on {device}...")

# Upgraded to TrOCR-Large for better accuracy
# (handwritten-text recognition model; weights are downloaded on first run).
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-large-handwritten')
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-large-handwritten').to(device).eval()

print("Loading PaddleOCR...")
# Optimized settings for handwriting detection.
# Only the text *detector* of PaddleOCR is used (see process_image);
# recognition is done by TrOCR on the detected line crops.
detector = PaddleOCR(
    use_angle_cls=True,
    lang='en',
    show_log=False,
    det_limit_side_len=2500,   # High resolution
    det_db_thresh=0.2,         # More sensitive threshold
    det_db_box_thresh=0.4,     # Better box filtering
    det_db_unclip_ratio=1.8    # Larger text regions for handwriting
)
1103
 
1104
 
1105
# ==========================================
# 🧠 PREPROCESSING FOR HANDWRITING
# ==========================================
def preprocess_for_handwriting(image_np):
    """
    Enhanced preprocessing specifically for handwriting.

    Pipeline: grayscale -> edge-preserving denoise -> adaptive
    binarization -> optional deskew.

    Args:
        image_np: RGB image as an H x W x 3 uint8 numpy array.

    Returns:
        The binarized (and possibly deskewed) image converted back to
        3-channel RGB so PaddleOCR can consume it directly.
    """
    # Convert to grayscale
    gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)

    # Apply bilateral filter to reduce noise while preserving edges
    denoised = cv2.bilateralFilter(gray, 9, 75, 75)

    # Adaptive thresholding (better for varying lighting).
    # With THRESH_BINARY the ink ends up at 0 and the paper at 255.
    binary = cv2.adaptiveThreshold(
        denoised, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        15, 10
    )

    # Optional: Deskew the image.
    # BUGFIX: the skew angle must be estimated from the *ink* pixels.
    # The previous `binary > 0` selected the white background (nearly the
    # whole page), so minAreaRect always returned ~0 and the deskew branch
    # never did anything useful. Ink is black (0) after THRESH_BINARY.
    coords = np.column_stack(np.where(binary == 0))
    if len(coords) > 0:
        # NOTE(review): OpenCV >= 4.5 changed minAreaRect's angle range to
        # (0, 90]; this correction assumes the classic [-90, 0) convention —
        # verify against the deployed OpenCV version.
        angle = cv2.minAreaRect(coords)[-1]
        if angle < -45:
            angle = -(90 + angle)
        else:
            angle = -angle

        # Only deskew if angle is significant (> 0.5 degrees)
        if abs(angle) > 0.5:
            (h, w) = binary.shape
            center = (w // 2, h // 2)
            M = cv2.getRotationMatrix2D(center, angle, 1.0)
            binary = cv2.warpAffine(
                binary, M, (w, h),
                flags=cv2.INTER_CUBIC,
                borderMode=cv2.BORDER_REPLICATE
            )

    # Convert back to RGB for PaddleOCR
    return cv2.cvtColor(binary, cv2.COLOR_GRAY2RGB)
1149
+
1150
+
1151
# ==========================================
# 🧠 IMPROVED LINE DETECTION WITH DBSCAN
# ==========================================
def cluster_boxes_into_lines(raw_boxes, log_data):
    """
    Group detector boxes into text lines via 1-D DBSCAN on their y-centers.

    Clustering copes with wobbly handwriting baselines far better than a
    fixed vertical-tolerance rule.

    Args:
        raw_boxes: iterable of 4-point polygons from PaddleOCR.
        log_data: list of strings; progress messages are appended in place.

    Returns:
        List of [x1, y1, x2, y2] line boxes, sorted top-to-bottom.
    """
    if raw_boxes is None or len(raw_boxes) == 0:
        return []

    # Axis-aligned bounding rectangle of each detected polygon.
    rects = []
    for poly in raw_boxes:
        pts = np.array(poly).astype(np.float32)
        rects.append([
            np.min(pts[:, 0]), np.min(pts[:, 1]),
            np.max(pts[:, 0]), np.max(pts[:, 1]),
        ])

    log_data.append(f"βœ“ Raw Detections: {len(rects)} boxes found.")

    # Drop specks below the minimum plausible glyph size.
    rects = [r for r in rects if (r[2] - r[0]) > 15 and (r[3] - r[1]) > 10]
    log_data.append(f"βœ“ After noise filtering: {len(rects)} boxes remain.")

    if len(rects) == 0:
        return []

    # Discard duplicate detections nested inside bigger boxes.
    rects = filter_nested_boxes(rects)
    log_data.append(f"βœ“ After removing nested boxes: {len(rects)} boxes remain.")

    # Cluster on y-centers; eps scales with the median box height so the
    # grouping adapts to the handwriting size.
    y_centers = np.array([(r[1] + r[3]) / 2 for r in rects])
    median_height = np.median(np.array([r[3] - r[1] for r in rects]))
    eps = median_height * 0.6

    log_data.append(f"βœ“ Clustering parameters: median_height={median_height:.1f}px, eps={eps:.1f}px")

    # min_samples=1 means every box joins some cluster (no noise label).
    labels = DBSCAN(eps=eps, min_samples=1, metric='euclidean').fit_predict(
        y_centers.reshape(-1, 1)
    )

    log_data.append(f"βœ“ DBSCAN found {len(set(labels))} text lines.")

    # Merge each cluster into a single line-level bounding box.
    lines = []
    for label in set(labels):
        members = [rects[idx] for idx, lab in enumerate(labels) if lab == label]
        members.sort(key=lambda b: b[0])  # left-to-right within the line
        lines.append([
            min(b[0] for b in members),
            min(b[1] for b in members),
            max(b[2] for b in members),
            max(b[3] for b in members),
        ])

    # Reading order: top of page first.
    lines.sort(key=lambda r: r[1])

    log_data.append(f"βœ“ Final merged lines: {len(lines)} lines created.\n")

    return lines
1232
+
1233
 
1234
def filter_nested_boxes(boxes, containment_thresh=0.85):
    """
    Removes boxes that are mostly contained within other larger boxes.
    This prevents duplicate detections.

    Boxes are [x1, y1, x2, y2]; a box is dropped when more than
    ``containment_thresh`` of its area lies inside an already-kept
    (larger) box. Result is ordered largest-area first.
    """
    if not boxes:
        return []

    # Largest boxes first, so containers are kept before their contents.
    by_area = sorted(
        (list(b) for b in boxes),
        key=lambda r: (r[2] - r[0]) * (r[3] - r[1]),
        reverse=True,
    )

    kept = []
    for cand in by_area:
        cand = cand[:4]
        # Nested if it sits almost entirely inside any box we kept already.
        nested = any(
            calculate_iou_containment(cand, big) > containment_thresh
            for big in kept
        )
        if not nested:
            kept.append(cand)

    return kept


def calculate_iou_containment(box1, box2):
    """
    Calculates how much of box1 is inside box2.
    Returns: ratio (0.0 to 1.0)
    """
    # Intersection rectangle corners.
    ix1, iy1 = max(box1[0], box2[0]), max(box1[1], box2[1])
    ix2, iy2 = min(box1[2], box2[2]), min(box1[3], box2[3])

    # No overlap at all.
    if ix2 < ix1 or iy2 < iy1:
        return 0.0

    inter = (ix2 - ix1) * (iy2 - iy1)
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])

    # Degenerate (zero-area) box: treat as no containment.
    return inter / area1 if area1 else 0.0
 
1291
 
 
 
1292
 
1293
# ==========================================
# 🧠 ENHANCED TEXT RECOGNITION
# ==========================================
def recognize_text_batch(crops, batch_size=4):
    """
    Run TrOCR over ``crops`` in fixed-size batches.

    Batching keeps the accelerator busy compared with one-image-at-a-time
    inference while bounding peak memory; ``batch_size`` caps how many
    crops go through the model at once.

    Args:
        crops: list of PIL images (one text line each).
        batch_size: number of crops per forward pass.

    Returns:
        List of decoded strings, one per crop, in input order.
    """
    decoded = []

    with torch.no_grad():
        for start in range(0, len(crops), batch_size):
            chunk = crops[start:start + batch_size]

            pixel_values = processor(
                images=chunk,
                return_tensors="pt"
            ).pixel_values.to(device)

            generated_ids = model.generate(
                pixel_values,
                max_length=64,
                num_beams=4,        # beam search for better quality
                early_stopping=True
            )

            decoded.extend(
                processor.batch_decode(generated_ids, skip_special_tokens=True)
            )

    return decoded
 
1322
 
1323
 
1324
# ==========================================
# 🎯 MAIN PROCESSING FUNCTION
# ==========================================
def process_image(image, use_preprocessing=True):
    """
    Main OCR pipeline with optional preprocessing.

    Steps: (1) optional preprocessing, (2) PaddleOCR text detection,
    (3) DBSCAN line clustering, (4) crop extraction, (5) batched TrOCR
    recognition.

    Args:
        image: PIL image uploaded by the user, or None.
        use_preprocessing: when True, run preprocess_for_handwriting
            before detection (bound to the UI checkbox).

    Returns:
        Tuple of (annotated PIL image, list of PIL line crops,
        transcribed text, newline-joined debug log). On error the input
        image is returned unannotated with an error message.
    """
    logs = []

    if image is None:
        return None, [], "⚠️ Please upload an image.", "No logs."

    logs.append("=" * 50)
    logs.append("πŸš€ STARTING OCR PIPELINE")
    logs.append("=" * 50 + "\n")

    # Convert to numpy array
    image_np = np.array(image.convert("RGB"))
    # Keep an untouched copy: annotation and cropping use the original pixels.
    original_image = image_np.copy()

    # Step 1: Preprocessing
    if use_preprocessing:
        logs.append("πŸ“ Step 1: Preprocessing image for handwriting...")
        preprocessed = preprocess_for_handwriting(image_np)
        logs.append("βœ“ Preprocessing complete.\n")
    else:
        preprocessed = image_np
        logs.append("πŸ“ Step 1: Skipping preprocessing (disabled).\n")

    # Step 2: Text Detection
    # NOTE(review): detection runs on `preprocessed`, but boxes are drawn on
    # and cropped from `original_image` below. If preprocessing deskews
    # (rotates) the image, the coordinates may not line up with the
    # original — confirm whether deskew can actually fire here.
    logs.append("πŸ“ Step 2: Detecting text regions...")
    try:
        dt_boxes, _ = detector.text_detector(preprocessed)
    except Exception as e:
        error_msg = f"❌ Detection Error: {str(e)}"
        logs.append(error_msg)
        return image, [], error_msg, "\n".join(logs)

    if dt_boxes is None or len(dt_boxes) == 0:
        error_msg = "⚠️ No text detected in the image."
        logs.append(error_msg)
        return image, [], error_msg, "\n".join(logs)

    # Step 3: Line Clustering
    logs.append("\nπŸ“ Step 3: Clustering text boxes into lines...")
    line_boxes = cluster_boxes_into_lines(dt_boxes, logs)

    if len(line_boxes) == 0:
        error_msg = "⚠️ No valid text lines found after filtering."
        logs.append(error_msg)
        return image, [], error_msg, "\n".join(logs)

    # Step 4: Extract and Recognize
    logs.append("πŸ“ Step 4: Extracting and recognizing text...\n")
    logs.append("-" * 50)

    annotated_img = original_image.copy()
    debug_crops = []   # shown in the UI gallery
    crop_images = []   # fed to the recognizer (same objects as debug_crops)

    for i, box in enumerate(line_boxes):
        x1, y1, x2, y2 = map(int, box)

        logs.append(f"Line {i+1}: [{x1}, {y1}, {x2}, {y2}] (w={x2-x1}, h={y2-y1})")

        # Draw bounding box on visualization
        color = (0, 255, 0)  # Green
        cv2.rectangle(annotated_img, (x1, y1), (x2, y2), color, 2)
        # Line number label just above the box.
        cv2.putText(annotated_img, f"L{i+1}", (x1, y1-5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1)

        # Add padding for better recognition (clamped to image bounds).
        PAD = 10
        h, w, _ = original_image.shape
        x1_pad = max(0, x1 - PAD)
        y1_pad = max(0, y1 - PAD)
        x2_pad = min(w, x2 + PAD)
        y2_pad = min(h, y2 + PAD)

        # Crop the line
        crop = original_image[y1_pad:y2_pad, x1_pad:x2_pad]
        pil_crop = Image.fromarray(crop)
        crop_images.append(pil_crop)
        debug_crops.append(pil_crop)

    logs.append("-" * 50)
    logs.append(f"\nπŸ“ Step 5: Running OCR on {len(crop_images)} line crops...")

    # Batch recognition
    recognized_texts = recognize_text_batch(crop_images, batch_size=4)

    # Filter and log results (empty recognitions are logged but dropped
    # from the final transcript).
    results = []
    logs.append("\n" + "=" * 50)
    logs.append("πŸ“„ RECOGNITION RESULTS")
    logs.append("=" * 50 + "\n")

    for i, text in enumerate(recognized_texts):
        text = text.strip()
        if text:
            results.append(text)
            logs.append(f"Line {i+1}: {text}")
        else:
            logs.append(f"Line {i+1}: [empty]")

    # Final output
    full_text = "\n".join(results)

    logs.append("\n" + "=" * 50)
    logs.append(f"βœ… COMPLETE: {len(results)} lines transcribed.")
    logs.append("=" * 50)

    return Image.fromarray(annotated_img), debug_crops, full_text, "\n".join(logs)
1437
 
1438
+
1439
# ==========================================
# 🎨 GRADIO UI
# ==========================================
# Layout: left column = upload + options + button; right column = tabbed
# outputs; a gallery row underneath shows the individual line crops.
with gr.Blocks(theme=gr.themes.Soft(), title="Advanced OCR with DBSCAN") as demo:
    gr.Markdown("""
    # πŸ”¬ Advanced Handwriting OCR with DBSCAN Clustering

    **Improvements:**
    - 🎯 DBSCAN clustering for intelligent line detection
    - πŸ” TrOCR-Large model for better accuracy
    - πŸ–ΌοΈ Preprocessing pipeline for handwriting
    - ⚑ Batch processing for efficiency
    - πŸ“Š Detailed debug logs
    """)

    with gr.Row():
        with gr.Column(scale=1):
            input_img = gr.Image(type="pil", label="πŸ“€ Upload Handwritten Image")

            # Maps to process_image's use_preprocessing flag.
            with gr.Accordion("βš™οΈ Options", open=False):
                use_preprocess = gr.Checkbox(
                    label="Enable preprocessing (denoising, deskewing)",
                    value=True,
                    info="Recommended for photos and low-quality scans"
                )

            btn = gr.Button("πŸš€ Transcribe", variant="primary", size="lg")

        with gr.Column(scale=1):
            with gr.Tabs():
                with gr.Tab("πŸ–ΌοΈ Visualization"):
                    output_img = gr.Image(label="Detected Lines")
                    gr.Markdown("*Green boxes show detected text lines with line numbers*")

                with gr.Tab("πŸ“ Extracted Text"):
                    output_txt = gr.Textbox(
                        label="Recognized Text",
                        lines=15,
                        show_copy_button=True,
                        placeholder="Transcribed text will appear here..."
                    )

                with gr.Tab("πŸ” Debug Logs"):
                    log_output = gr.Textbox(
                        label="Processing Logs",
                        lines=20,
                        interactive=False
                    )

    with gr.Row():
        gallery = gr.Gallery(
            label="πŸ“Έ Line Crops (For Debugging)",
            columns=4,
            height=200,
            object_fit="contain"
        )

    gr.Markdown("""
    ---
    ### πŸ’‘ Tips for Best Results:
    - Upload clear, high-contrast images
    - Ensure text is not too small (minimum 15px height)
    - Try enabling/disabling preprocessing based on your image quality
    - Check debug logs if results are unexpected
    """)

    # Connect button to processing function
    # (outputs match process_image's 4-tuple return).
    btn.click(
        fn=process_image,
        inputs=[input_img, use_preprocess],
        outputs=[output_img, gallery, output_txt, log_output]
    )

if __name__ == "__main__":
    demo.launch()
1514
 
1515
 
1516