iammraat committed
Commit 2e99bc0 · verified · 1 Parent(s): 204a6d4

Update app.py

Files changed (1): app.py (+155 −618)
app.py CHANGED
@@ -816,257 +816,6 @@
 
 
 
-# import gradio as gr
-# import torch
-# import numpy as np
-# import cv2
-# from PIL import Image
-# from transformers import TrOCRProcessor, VisionEncoderDecoderModel
-# from paddleocr import PaddleOCR
-# import pandas as pd
-
-# # --- 1. SETUP TR-OCR ---
-# device = "cuda" if torch.cuda.is_available() else "cpu"
-# print(f"Loading TrOCR on {device}...")
-# processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
-# model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten').to(device).eval()
-
-# # --- 2. SETUP PADDLEOCR ---
-# print("Loading PaddleOCR...")
-# # High resolution settings to detect faint text
-# detector = PaddleOCR(use_angle_cls=True, lang='en', show_log=False,
-#                      det_limit_side_len=2500, det_db_thresh=0.1, det_db_box_thresh=0.3)
-
-
-# # ==========================================
-# # 🧠 LOGIC: INTERSECTION OVER UNION (IOU)
-# # ==========================================
-# def calculate_iou_containment(box1, box2):
-#     """
-#     Calculates how much of box1 is inside box2.
-#     Returns: ratio (0.0 to 1.0)
-#     """
-#     x1 = max(box1[0], box2[0])
-#     y1 = max(box1[1], box2[1])
-#     x2 = min(box1[2], box2[2])
-#     y2 = min(box1[3], box2[3])
-
-#     if x2 < x1 or y2 < y1:
-#         return 0.0
-
-#     intersection = (x2 - x1) * (y2 - y1)
-#     area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
-
-#     return intersection / area1
-
-# def filter_nested_boxes(boxes, containment_thresh=0.85):
-#     """
-#     Removes boxes that are mostly contained within other larger boxes.
-#     """
-#     if not boxes: return []
-
-#     # [x1, y1, x2, y2, area]
-#     active = []
-#     for b in boxes:
-#         area = (b[2] - b[0]) * (b[3] - b[1])
-#         active.append(list(b) + [area])
-
-#     # Sort by Area descending (Biggest first)
-#     active.sort(key=lambda x: x[4], reverse=True)
-
-#     final_boxes = []
-
-#     for current in active:
-#         is_nested = False
-#         curr_box = current[:4]
-
-#         # Check if this box is inside any bigger box we already kept
-#         for kept in final_boxes:
-#             overlap_ratio = calculate_iou_containment(curr_box, kept)
-
-#             if overlap_ratio > containment_thresh:
-#                 is_nested = True
-#                 break
-
-#         if not is_nested:
-#             final_boxes.append(curr_box)
-
-#     return final_boxes
-
-
-# # ==========================================
-# # 🧠 LOGIC: STRICT LINE MERGING
-# # ==========================================
-# def merge_boxes_into_lines(raw_boxes, log_data):
-#     """
-#     Merges boxes horizontally but prevents vertical merging.
-#     """
-#     if raw_boxes is None or len(raw_boxes) == 0:
-#         return []
-
-#     # 1. Convert to Rects
-#     rects = []
-#     for box in raw_boxes:
-#         box = np.array(box).astype(np.float32)
-#         x1, y1 = np.min(box[:, 0]), np.min(box[:, 1])
-#         x2, y2 = np.max(box[:, 0]), np.max(box[:, 1])
-#         rects.append([x1, y1, x2, y2])
-
-#     log_data.append(f"Raw Detections: {len(rects)} boxes found.")
-
-#     # 2. Filter Nested
-#     rects = filter_nested_boxes(rects)
-#     log_data.append(f"After Cleaning Nested: {len(rects)} boxes remain.")
-
-#     # 3. Sort by Y-Center (Top to Bottom)
-#     rects.sort(key=lambda r: (r[1] + r[3]) / 2)
-
-#     lines = []
-
-#     while rects:
-#         # Start a new line with the highest remaining box
-#         current_line = [rects.pop(0)]
-
-#         # Calculate the dynamic "height" of this line based on the first word
-#         ref_h = current_line[0][3] - current_line[0][1]
-#         ref_y_center = (current_line[0][1] + current_line[0][3]) / 2
-
-#         # Look for other words on this SAME line
-#         # STRICT RULE: A box is on the same line ONLY if its Y-center
-#         # is within 50% of the reference box's height.
-#         vertical_tolerance = ref_h * 0.5
-
-#         remaining_rects = []
-#         for r in rects:
-#             r_y_center = (r[1] + r[3]) / 2
-
-#             if abs(r_y_center - ref_y_center) < vertical_tolerance:
-#                 current_line.append(r)
-#             else:
-#                 remaining_rects.append(r)
-
-#         rects = remaining_rects
-
-#         # Sort words in this line left-to-right
-#         current_line.sort(key=lambda r: r[0])
-
-#         # 4. Merge the horizontal group into ONE box
-#         lx1 = min(r[0] for r in current_line)
-#         ly1 = min(r[1] for r in current_line)
-#         lx2 = max(r[2] for r in current_line)
-#         ly2 = max(r[3] for r in current_line)
-
-#         lines.append([lx1, ly1, lx2, ly2])
-
-#     # Final Sort by Y
-#     lines.sort(key=lambda r: r[1])
-
-#     log_data.append(f"Final Merged Lines: {len(lines)} lines created.")
-#     return lines
-
-
-# def process_image(image):
-#     logs = []  # Store debug messages here
-
-#     if image is None:
-#         return None, [], "Please upload an image.", "No logs."
-
-#     image_np = np.array(image.convert("RGB"))
-
-#     # DETECT
-#     try:
-#         dt_boxes, _ = detector.text_detector(image_np)
-#     except Exception as e:
-#         return image, [], f"Detection Error: {str(e)}", "\n".join(logs)
-
-#     if dt_boxes is None or len(dt_boxes) == 0:
-#         return image, [], "No text detected.", "\n".join(logs)
-
-#     # PROCESS
-#     line_boxes = merge_boxes_into_lines(dt_boxes, logs)
-
-#     annotated_img = image_np.copy()
-#     results = []
-#     debug_crops = []
-
-#     # Log the final box coordinates for inspection
-#     logs.append("\n--- Final Box Coordinates ---")
-
-#     for i, box in enumerate(line_boxes):
-#         x1, y1, x2, y2 = map(int, box)
-
-#         logs.append(f"Line {i+1}: x={x1}, y={y1}, w={x2-x1}, h={y2-y1}")
-
-#         # Filter Noise
-#         if (x2 - x1) < 20 or (y2 - y1) < 15:
-#             logs.append(f"-> Skipped Line {i+1} (Too Small/Noise)")
-#             continue
-
-#         # Draw (Green)
-#         cv2.rectangle(annotated_img, (x1, y1), (x2, y2), (0, 255, 0), 2)
-
-#         # PADDING
-#         PAD = 10
-#         h, w, _ = image_np.shape
-#         x1 = max(0, x1 - PAD)
-#         y1 = max(0, y1 - PAD)
-#         x2 = min(w, x2 + PAD)
-#         y2 = min(h, y2 + PAD)
-
-#         crop = image_np[y1:y2, x1:x2]
-#         pil_crop = Image.fromarray(crop)
-#         debug_crops.append(pil_crop)
-
-#         # RECOGNIZE
-#         with torch.no_grad():
-#             pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
-#             generated_ids = model.generate(pixel_values)
-#             text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-#             if text.strip():
-#                 results.append(text)
-
-#     full_text = "\n".join(results)
-#     return Image.fromarray(annotated_img), debug_crops, full_text, "\n".join(logs)
-
-# # --- UI ---
-# with gr.Blocks(theme=gr.themes.Soft()) as demo:
-#     gr.Markdown("# ⚡ Smart Line-Level OCR (Debug Mode)")
-
-#     with gr.Row():
-#         with gr.Column(scale=1):
-#             input_img = gr.Image(type="pil", label="Upload Image")
-#             btn = gr.Button("Transcribe", variant="primary")
-
-#         with gr.Column(scale=1):
-#             with gr.Tabs():
-#                 with gr.Tab("Visualization"):
-#                     output_img = gr.Image(label="Detected Lines")
-#                 with gr.Tab("Extracted Text"):
-#                     output_txt = gr.Textbox(label="Result", lines=15, show_copy_button=True)
-#                 with gr.Tab("Debug Logs"):
-#                     # CHANGED HERE: Uses Textbox instead of Code to avoid version errors
-#                     log_output = gr.Textbox(label="Processing Logs", lines=20, interactive=False)
-
-#     with gr.Row():
-#         gallery = gr.Gallery(label="Final Line Crops", columns=4, height=200)
-
-#     btn.click(process_image, input_img, [output_img, gallery, output_txt, log_output])
-
-# if __name__ == "__main__":
-#     demo.launch()
-
-
 import gradio as gr
 import torch
 import numpy as np
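
Note: the containment check shared by both versions of app.py is one-sided — intersection area divided by the first box's own area, not symmetric IoU. A minimal standalone sketch of how that ratio behaves, with toy coordinates that are illustrative only, not taken from the app:

    # Containment of box1 in box2, both as [x1, y1, x2, y2].
    def containment(box1, box2):
        ix1, iy1 = max(box1[0], box2[0]), max(box1[1], box2[1])
        ix2, iy2 = min(box1[2], box2[2]), min(box1[3], box2[3])
        if ix2 < ix1 or iy2 < iy1:
            return 0.0  # boxes do not overlap at all
        inter = (ix2 - ix1) * (iy2 - iy1)
        area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
        return inter / area1 if area1 else 0.0

    print(containment([10, 10, 50, 30], [0, 0, 200, 40]))    # 1.0  -> dropped at thresh 0.85
    print(containment([190, 10, 250, 30], [0, 0, 200, 40]))  # ~0.17 -> kept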
@@ -1074,192 +823,64 @@ import cv2
 from PIL import Image
 from transformers import TrOCRProcessor, VisionEncoderDecoderModel
 from paddleocr import PaddleOCR
-from sklearn.cluster import DBSCAN
-from scipy.spatial.distance import pdist, squareform
-import warnings
-warnings.filterwarnings('ignore')
+import pandas as pd
 
-# ==========================================
-# 🚀 SETUP MODELS
-# ==========================================
+# --- 1. SETUP TR-OCR ---
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Loading TrOCR on {device}...")
-
-# Upgraded to TrOCR-Large for better accuracy
 processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
 model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten').to(device).eval()
 
+# --- 2. SETUP PADDLEOCR ---
 print("Loading PaddleOCR...")
-# Optimized settings for handwriting detection
-detector = PaddleOCR(
-    use_angle_cls=True,
-    lang='en',
-    show_log=False,
-    det_limit_side_len=2500,  # High resolution
-    det_db_thresh=0.2,        # More sensitive threshold
-    det_db_box_thresh=0.4,    # Better box filtering
-    det_db_unclip_ratio=1.8   # Larger text regions for handwriting
-)
+# High resolution settings to detect faint text
+detector = PaddleOCR(use_angle_cls=True, lang='en', show_log=False,
+                     det_limit_side_len=2500, det_db_thresh=0.1, det_db_box_thresh=0.3)
 
 
 # ==========================================
-# 🧠 PREPROCESSING FOR HANDWRITING
+# 🧠 LOGIC: INTERSECTION OVER UNION (IOU)
 # ==========================================
-def preprocess_for_handwriting(image_np):
-    """
-    Enhanced preprocessing specifically for handwriting.
-    Returns preprocessed image for better detection.
-    """
-    # Convert to grayscale
-    gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
-
-    # Apply bilateral filter to reduce noise while preserving edges
-    denoised = cv2.bilateralFilter(gray, 9, 75, 75)
-
-    # Adaptive thresholding (better for varying lighting)
-    binary = cv2.adaptiveThreshold(
-        denoised, 255,
-        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
-        cv2.THRESH_BINARY,
-        15, 10
-    )
-
-    # Optional: Deskew the image
-    coords = np.column_stack(np.where(binary > 0))
-    if len(coords) > 0:
-        angle = cv2.minAreaRect(coords)[-1]
-        if angle < -45:
-            angle = -(90 + angle)
-        else:
-            angle = -angle
-
-        # Only deskew if angle is significant (> 0.5 degrees)
-        if abs(angle) > 0.5:
-            (h, w) = binary.shape
-            center = (w // 2, h // 2)
-            M = cv2.getRotationMatrix2D(center, angle, 1.0)
-            binary = cv2.warpAffine(
-                binary, M, (w, h),
-                flags=cv2.INTER_CUBIC,
-                borderMode=cv2.BORDER_REPLICATE
-            )
-
-    # Convert back to RGB for PaddleOCR
-    return cv2.cvtColor(binary, cv2.COLOR_GRAY2RGB)
-
-
-# ==========================================
-# 🧠 IMPROVED LINE DETECTION WITH DBSCAN
-# ==========================================
-def cluster_boxes_into_lines(raw_boxes, log_data, eps_multiplier=0.35):
+def calculate_iou_containment(box1, box2):
     """
-    Uses DBSCAN clustering to intelligently group text boxes into lines.
-    This handles irregular handwriting baselines much better than rule-based methods.
-
-    Args:
-        eps_multiplier: Controls clustering sensitivity (lower = stricter line separation)
-                        Default 0.35 prevents multi-line merging
+    Calculates how much of box1 is inside box2.
+    Returns: ratio (0.0 to 1.0)
     """
-    if raw_boxes is None or len(raw_boxes) == 0:
-        return []
-
-    # 1. Convert PaddleOCR boxes to rectangles
-    rects = []
-    for box in raw_boxes:
-        box = np.array(box).astype(np.float32)
-        x1, y1 = np.min(box[:, 0]), np.min(box[:, 1])
-        x2, y2 = np.max(box[:, 0]), np.max(box[:, 1])
-        rects.append([x1, y1, x2, y2])
-
-    log_data.append(f"✓ Raw Detections: {len(rects)} boxes found.")
-
-    # 2. Filter out noise and very small boxes
-    filtered_rects = []
-    for rect in rects:
-        w = rect[2] - rect[0]
-        h = rect[3] - rect[1]
-        if w > 15 and h > 10:  # Minimum size threshold
-            filtered_rects.append(rect)
-
-    rects = filtered_rects
-    log_data.append(f"✓ After noise filtering: {len(rects)} boxes remain.")
-
-    if len(rects) == 0:
-        return []
-
-    # 3. Remove nested/overlapping boxes
-    rects = filter_nested_boxes(rects)
-    log_data.append(f"✓ After removing nested boxes: {len(rects)} boxes remain.")
-
-    # 4. DBSCAN clustering by Y-coordinate
-    # Extract y-centers for clustering
-    y_centers = np.array([(r[1] + r[3]) / 2 for r in rects])
-
-    # Calculate adaptive epsilon based on median box height
-    heights = np.array([r[3] - r[1] for r in rects])
-    median_height = np.median(heights)
-
-    # CRITICAL FIX: Lower multiplier to prevent multi-line merging
-    # 0.35 = strict line separation, 0.6 = more permissive (old value)
-    eps = median_height * eps_multiplier
-
-    log_data.append(f"✓ Clustering parameters: median_height={median_height:.1f}px, eps={eps:.1f}px (multiplier={eps_multiplier})")
-
-    # Perform clustering
-    clustering = DBSCAN(eps=eps, min_samples=1, metric='euclidean')
-    labels = clustering.fit_predict(y_centers.reshape(-1, 1))
-
-    log_data.append(f"✓ DBSCAN found {len(set(labels))} text lines.")
-
-    # 5. Group boxes by cluster labels
-    lines = []
-    for label in set(labels):
-        # Get all boxes in this cluster
-        line_boxes = [rects[i] for i, l in enumerate(labels) if l == label]
-
-        # Sort boxes left-to-right within the line
-        line_boxes.sort(key=lambda b: b[0])
-
-        # Merge into a single bounding box for the entire line
-        x1 = min(b[0] for b in line_boxes)
-        y1 = min(b[1] for b in line_boxes)
-        x2 = max(b[2] for b in line_boxes)
-        y2 = max(b[3] for b in line_boxes)
-
-        lines.append([x1, y1, x2, y2])
-
-    # Sort lines top-to-bottom
-    lines.sort(key=lambda r: r[1])
-
-    log_data.append(f"✓ Final merged lines: {len(lines)} lines created.\n")
-
-    return lines
+    x1 = max(box1[0], box2[0])
+    y1 = max(box1[1], box2[1])
+    x2 = min(box1[2], box2[2])
+    y2 = min(box1[3], box2[3])
+
+    if x2 < x1 or y2 < y1:
+        return 0.0
+
+    intersection = (x2 - x1) * (y2 - y1)
+    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
+
+    return intersection / area1
 
 
 def filter_nested_boxes(boxes, containment_thresh=0.85):
     """
     Removes boxes that are mostly contained within other larger boxes.
-    This prevents duplicate detections.
     """
-    if not boxes:
-        return []
+    if not boxes: return []
 
-    # Add area to each box
-    boxes_with_area = []
+    # [x1, y1, x2, y2, area]
+    active = []
     for b in boxes:
         area = (b[2] - b[0]) * (b[3] - b[1])
-        boxes_with_area.append(list(b) + [area])
+        active.append(list(b) + [area])
 
-    # Sort by area (largest first)
-    boxes_with_area.sort(key=lambda x: x[4], reverse=True)
+    # Sort by Area descending (Biggest first)
+    active.sort(key=lambda x: x[4], reverse=True)
 
     final_boxes = []
 
-    for current in boxes_with_area:
+    for current in active:
         is_nested = False
         curr_box = current[:4]
 
-        # Check if this box is contained within any already-kept box
+        # Check if this box is inside any bigger box we already kept
        for kept in final_boxes:
            overlap_ratio = calculate_iou_containment(curr_box, kept)
@@ -1273,262 +894,178 @@ def filter_nested_boxes(boxes, containment_thresh=0.85):
     return final_boxes
 
 
-def calculate_iou_containment(box1, box2):
-    """
-    Calculates how much of box1 is inside box2.
-    Returns: ratio (0.0 to 1.0)
-    """
-    x1 = max(box1[0], box2[0])
-    y1 = max(box1[1], box2[1])
-    x2 = min(box1[2], box2[2])
-    y2 = min(box1[3], box2[3])
-
-    if x2 < x1 or y2 < y1:
-        return 0.0
-
-    intersection = (x2 - x1) * (y2 - y1)
-    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
-
-    if area1 == 0:
-        return 0.0
-
-    return intersection / area1
-
-
 # ==========================================
-# 🧠 ENHANCED TEXT RECOGNITION
+# 🧠 LOGIC: STRICT LINE MERGING
 # ==========================================
-def recognize_text_batch(crops, batch_size=4):
+def merge_boxes_into_lines(raw_boxes, log_data):
     """
-    Process multiple crops in batches for better efficiency.
+    Merges boxes horizontally but prevents vertical merging.
     """
-    results = []
-
-    for i in range(0, len(crops), batch_size):
-        batch_crops = crops[i:i+batch_size]
-
-        with torch.no_grad():
-            pixel_values = processor(
-                images=batch_crops,
-                return_tensors="pt"
-            ).pixel_values.to(device)
-
-            generated_ids = model.generate(
-                pixel_values,
-                max_length=64,
-                num_beams=4,  # Beam search for better quality
-                early_stopping=True
-            )
-
-            texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
-            results.extend(texts)
-
-    return results
-
-
-# ==========================================
-# 🎯 MAIN PROCESSING FUNCTION
-# ==========================================
-def process_image(image, use_preprocessing=True, eps_multiplier=0.35):
-    """
-    Main OCR pipeline with optional preprocessing.
-
-    Args:
-        image: Input PIL image
-        use_preprocessing: Whether to apply preprocessing
-        eps_multiplier: DBSCAN epsilon multiplier for line clustering
-    """
-    logs = []
+    if raw_boxes is None or len(raw_boxes) == 0:
+        return []
+
+    # 1. Convert to Rects
+    rects = []
+    for box in raw_boxes:
+        box = np.array(box).astype(np.float32)
+        x1, y1 = np.min(box[:, 0]), np.min(box[:, 1])
+        x2, y2 = np.max(box[:, 0]), np.max(box[:, 1])
+        rects.append([x1, y1, x2, y2])
+
+    log_data.append(f"Raw Detections: {len(rects)} boxes found.")
+
+    # 2. Filter Nested
+    rects = filter_nested_boxes(rects)
+    log_data.append(f"After Cleaning Nested: {len(rects)} boxes remain.")
+
+    # 3. Sort by Y-Center (Top to Bottom)
+    rects.sort(key=lambda r: (r[1] + r[3]) / 2)
+
+    lines = []
+
+    while rects:
+        # Start a new line with the highest remaining box
+        current_line = [rects.pop(0)]
+
+        # Calculate the dynamic "height" of this line based on the first word
+        ref_h = current_line[0][3] - current_line[0][1]
+        ref_y_center = (current_line[0][1] + current_line[0][3]) / 2
+
+        # Look for other words on this SAME line
+        # STRICT RULE: A box is on the same line ONLY if its Y-center
+        # is within 50% of the reference box's height.
+        vertical_tolerance = ref_h * 0.5
+
+        remaining_rects = []
+        for r in rects:
+            r_y_center = (r[1] + r[3]) / 2
+
+            if abs(r_y_center - ref_y_center) < vertical_tolerance:
+                current_line.append(r)
+            else:
+                remaining_rects.append(r)
+
+        rects = remaining_rects
+
+        # Sort words in this line left-to-right
+        current_line.sort(key=lambda r: r[0])
+
+        # 4. Merge the horizontal group into ONE box
+        lx1 = min(r[0] for r in current_line)
+        ly1 = min(r[1] for r in current_line)
+        lx2 = max(r[2] for r in current_line)
+        ly2 = max(r[3] for r in current_line)
+
+        lines.append([lx1, ly1, lx2, ly2])
+
+    # Final Sort by Y
+    lines.sort(key=lambda r: r[1])
+
+    log_data.append(f"Final Merged Lines: {len(lines)} lines created.")
+    return lines
+
+
+def process_image(image):
+    logs = []  # Store debug messages here
 
     if image is None:
-        return None, [], "⚠️ Please upload an image.", "No logs."
-
-    logs.append("=" * 50)
-    logs.append("🚀 STARTING OCR PIPELINE")
-    logs.append("=" * 50 + "\n")
-
-    # Convert to numpy array
+        return None, [], "Please upload an image.", "No logs."
+
     image_np = np.array(image.convert("RGB"))
-    original_image = image_np.copy()
-
-    # Step 1: Preprocessing
-    if use_preprocessing:
-        logs.append("📝 Step 1: Preprocessing image for handwriting...")
-        preprocessed = preprocess_for_handwriting(image_np)
-        logs.append("✓ Preprocessing complete.\n")
-    else:
-        preprocessed = image_np
-        logs.append("📝 Step 1: Skipping preprocessing (disabled).\n")
-
-    # Step 2: Text Detection
-    logs.append("📝 Step 2: Detecting text regions...")
+
+    # DETECT
     try:
-        dt_boxes, _ = detector.text_detector(preprocessed)
+        dt_boxes, _ = detector.text_detector(image_np)
     except Exception as e:
-        error_msg = f"Detection Error: {str(e)}"
-        logs.append(error_msg)
-        return image, [], error_msg, "\n".join(logs)
+        return image, [], f"Detection Error: {str(e)}", "\n".join(logs)
 
     if dt_boxes is None or len(dt_boxes) == 0:
-        error_msg = "⚠️ No text detected in the image."
-        logs.append(error_msg)
-        return image, [], error_msg, "\n".join(logs)
-
-    # Step 3: Line Clustering
-    logs.append("\n📝 Step 3: Clustering text boxes into lines...")
-    line_boxes = cluster_boxes_into_lines(dt_boxes, logs, eps_multiplier=eps_multiplier)
-
-    if len(line_boxes) == 0:
-        error_msg = "⚠️ No valid text lines found after filtering."
-        logs.append(error_msg)
-        return image, [], error_msg, "\n".join(logs)
-
-    # Step 4: Extract and Recognize
-    logs.append("📝 Step 4: Extracting and recognizing text...\n")
-    logs.append("-" * 50)
-
-    annotated_img = original_image.copy()
+        return image, [], "No text detected.", "\n".join(logs)
+
+    # PROCESS
+    line_boxes = merge_boxes_into_lines(dt_boxes, logs)
+
+    annotated_img = image_np.copy()
+    results = []
     debug_crops = []
-    crop_images = []
+
+    # Log the final box coordinates for inspection
+    logs.append("\n--- Final Box Coordinates ---")
 
     for i, box in enumerate(line_boxes):
         x1, y1, x2, y2 = map(int, box)
 
-        logs.append(f"Line {i+1}: [{x1}, {y1}, {x2}, {y2}] (w={x2-x1}, h={y2-y1})")
-
-        # Draw bounding box on visualization
-        color = (0, 255, 0)  # Green
-        cv2.rectangle(annotated_img, (x1, y1), (x2, y2), color, 2)
-        cv2.putText(annotated_img, f"L{i+1}", (x1, y1-5),
-                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1)
-
-        # Add padding for better recognition
+        logs.append(f"Line {i+1}: x={x1}, y={y1}, w={x2-x1}, h={y2-y1}")
+
+        # Filter Noise
+        if (x2 - x1) < 20 or (y2 - y1) < 15:
+            logs.append(f"-> Skipped Line {i+1} (Too Small/Noise)")
+            continue
+
+        # Draw (Green)
+        cv2.rectangle(annotated_img, (x1, y1), (x2, y2), (0, 255, 0), 2)
+
+        # PADDING
         PAD = 10
-        h, w, _ = original_image.shape
-        x1_pad = max(0, x1 - PAD)
-        y1_pad = max(0, y1 - PAD)
-        x2_pad = min(w, x2 + PAD)
-        y2_pad = min(h, y2 + PAD)
-
-        # Crop the line
-        crop = original_image[y1_pad:y2_pad, x1_pad:x2_pad]
+        h, w, _ = image_np.shape
+        x1 = max(0, x1 - PAD)
+        y1 = max(0, y1 - PAD)
+        x2 = min(w, x2 + PAD)
+        y2 = min(h, y2 + PAD)
+
+        crop = image_np[y1:y2, x1:x2]
         pil_crop = Image.fromarray(crop)
-        crop_images.append(pil_crop)
         debug_crops.append(pil_crop)
-
-    logs.append("-" * 50)
-    logs.append(f"\n📝 Step 5: Running OCR on {len(crop_images)} line crops...")
-
-    # Batch recognition
-    recognized_texts = recognize_text_batch(crop_images, batch_size=4)
-
-    # Filter and log results
-    results = []
-    logs.append("\n" + "=" * 50)
-    logs.append("📄 RECOGNITION RESULTS")
-    logs.append("=" * 50 + "\n")
-
-    for i, text in enumerate(recognized_texts):
-        text = text.strip()
-        if text:
-            results.append(text)
-            logs.append(f"Line {i+1}: {text}")
-        else:
-            logs.append(f"Line {i+1}: [empty]")
-
-    # Final output
+
+        # RECOGNIZE
+        with torch.no_grad():
+            pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
+            generated_ids = model.generate(pixel_values)
+            text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+            if text.strip():
+                results.append(text)
+
     full_text = "\n".join(results)
-
-    logs.append("\n" + "=" * 50)
-    logs.append(f"✅ COMPLETE: {len(results)} lines transcribed.")
-    logs.append("=" * 50)
-
     return Image.fromarray(annotated_img), debug_crops, full_text, "\n".join(logs)
 
-
-# ==========================================
-# 🎨 GRADIO UI
-# ==========================================
-with gr.Blocks(theme=gr.themes.Soft(), title="Advanced OCR with DBSCAN") as demo:
-    gr.Markdown("""
-    # 🔬 Advanced Handwriting OCR with DBSCAN Clustering
-
-    **Improvements:**
-    - 🎯 DBSCAN clustering for intelligent line detection
-    - 🔍 TrOCR-Large model for better accuracy
-    - 🖼️ Preprocessing pipeline for handwriting
-    - ⚡ Batch processing for efficiency
-    - 📊 Detailed debug logs
-    """)
+# --- UI ---
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# ⚡ Smart Line-Level OCR (Debug Mode)")
 
     with gr.Row():
         with gr.Column(scale=1):
-            input_img = gr.Image(type="pil", label="📤 Upload Handwritten Image")
-
-            with gr.Accordion("⚙️ Options", open=False):
-                use_preprocess = gr.Checkbox(
-                    label="Enable preprocessing (denoising, deskewing)",
-                    value=True,
-                    info="Recommended for photos and low-quality scans"
-                )
-
-                eps_slider = gr.Slider(
-                    minimum=0.2,
-                    maximum=0.8,
-                    value=0.35,
-                    step=0.05,
-                    label="Line Separation Sensitivity",
-                    info="Lower = stricter separation (0.35 recommended for tight handwriting)"
-                )
-
-            btn = gr.Button("🚀 Transcribe", variant="primary", size="lg")
+            input_img = gr.Image(type="pil", label="Upload Image")
+            btn = gr.Button("Transcribe", variant="primary")
 
         with gr.Column(scale=1):
             with gr.Tabs():
-                with gr.Tab("🖼️ Visualization"):
+                with gr.Tab("Visualization"):
                     output_img = gr.Image(label="Detected Lines")
-                    gr.Markdown("*Green boxes show detected text lines with line numbers*")
-
-                with gr.Tab("📝 Extracted Text"):
-                    output_txt = gr.Textbox(
-                        label="Recognized Text",
-                        lines=15,
-                        show_copy_button=True,
-                        placeholder="Transcribed text will appear here..."
-                    )
-
-                with gr.Tab("🔍 Debug Logs"):
-                    log_output = gr.Textbox(
-                        label="Processing Logs",
-                        lines=20,
-                        interactive=False
-                    )
-
+                with gr.Tab("Extracted Text"):
+                    output_txt = gr.Textbox(label="Result", lines=15, show_copy_button=True)
+                with gr.Tab("Debug Logs"):
+                    # CHANGED HERE: Uses Textbox instead of Code to avoid version errors
+                    log_output = gr.Textbox(label="Processing Logs", lines=20, interactive=False)
+
     with gr.Row():
-        gallery = gr.Gallery(
-            label="📸 Line Crops (For Debugging)",
-            columns=4,
-            height=200,
-            object_fit="contain"
-        )
-
-    gr.Markdown("""
-    ---
-    ### 💡 Tips for Best Results:
-    - Upload clear, high-contrast images
-    - Ensure text is not too small (minimum 15px height)
-    - Try enabling/disabling preprocessing based on your image quality
-    - Check debug logs if results are unexpected
-    """)
-
-    # Connect button to processing function
-    btn.click(
-        fn=process_image,
-        inputs=[input_img, use_preprocess, eps_slider],
-        outputs=[output_img, gallery, output_txt, log_output]
-    )
+        gallery = gr.Gallery(label="Final Line Crops", columns=4, height=200)
+
+    btn.click(process_image, input_img, [output_img, gallery, output_txt, log_output])
 
 if __name__ == "__main__":
     demo.launch()
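
The restored merge_boxes_into_lines replaces the clustering with a greedy sweep: pop the topmost unassigned box, pull in every box whose y-center lies within half the reference box's height, and collapse the group into one rectangle. A minimal standalone sketch of that sweep on toy boxes (illustrative values, not app data):

    def merge_lines(rects):
        rects = sorted(rects, key=lambda r: (r[1] + r[3]) / 2)  # top to bottom
        lines = []
        while rects:
            ref = rects.pop(0)
            tol = (ref[3] - ref[1]) * 0.5  # half the reference box's height
            ref_yc = (ref[1] + ref[3]) / 2
            line = [ref] + [r for r in rects
                            if abs((r[1] + r[3]) / 2 - ref_yc) < tol]
            rects = [r for r in rects if r not in line]
            lines.append([min(r[0] for r in line), min(r[1] for r in line),
                          max(r[2] for r in line), max(r[3] for r in line)])
        return sorted(lines, key=lambda r: r[1])

    # Two words on one baseline plus one word below -> two line boxes:
    print(merge_lines([[0, 10, 40, 30], [50, 12, 90, 32], [0, 60, 80, 80]]))
    # [[0, 10, 90, 32], [0, 60, 80, 80]]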