# import gradio as gr # from transformers import TrOCRProcessor, VisionEncoderDecoderModel # import torch # from PIL import Image # # --- Model Setup --- # # We load the model outside the inference function to cache it on startup # MODEL_ID = "microsoft/trocr-base-handwritten" # print(f"Loading {MODEL_ID}...") # processor = TrOCRProcessor.from_pretrained(MODEL_ID) # model = VisionEncoderDecoderModel.from_pretrained(MODEL_ID) # # Check for GPU (Free Spaces are usually CPU-only, but this handles upgrades) # device = "cuda" if torch.cuda.is_available() else "cpu" # model.to(device) # print(f"Model loaded on device: {device}") # # --- Inference Function --- # def process_image(image): # if image is None: # return "Please upload an image." # try: # # 1. Convert to RGB (standardizes input) # image = image.convert("RGB") # # 2. Preprocess # pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device) # # 3. Generate text # generated_ids = model.generate(pixel_values) # generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] # return generated_text # except Exception as e: # return f"Error: {str(e)}" # # --- Gradio Interface --- # # Using the Blocks API for a clean layout # with gr.Blocks(theme=gr.themes.Soft()) as demo: # gr.Markdown( # """ # # ✍️ Handwritten Text Recognition # Using Microsoft's **TrOCR Small** model. Upload a handwritten note to transcribe it. 
# """ # ) # with gr.Row(): # with gr.Column(): # input_img = gr.Image(type="pil", label="Upload Image") # submit_btn = gr.Button("Transcribe", variant="primary") # with gr.Column(): # output_text = gr.Textbox(label="Result", interactive=False) # # Examples help users test it immediately without uploading their own file # # (Uncomment the list below if you upload example images to your repo) # # gr.Examples(["sample1.jpg"], inputs=input_img) # submit_btn.click(fn=process_image, inputs=input_img, outputs=output_text) # # Launch for Spaces # if __name__ == "__main__": # demo.launch() # import gradio as gr # import torch # import numpy as np # import cv2 # from PIL import Image # from transformers import TrOCRProcessor, VisionEncoderDecoderModel # from craft_text_detector import Craft # # ========================================== # # 🔧 PATCH 1: Fix Torchvision Compatibility # # ========================================== # import torchvision.models.vgg # if not hasattr(torchvision.models.vgg, 'model_urls'): # torchvision.models.vgg.model_urls = { # 'vgg16_bn': 'https://download.pytorch.org/models/vgg16_bn-6c64b313.pth' # } # # ========================================== # # 🔧 PATCH 2: The "Ratio Net" Logic Fix # # ========================================== # import craft_text_detector.craft_utils as craft_utils_module # def fixed_adjustResultCoordinates(polys, ratio_w, ratio_h, ratio_net=2): # if not polys: # return [] # adjusted = [] # for poly in polys: # if poly is None or len(poly) == 0: # continue # # Convert to numpy and reshape # p = np.array(poly).reshape(-1, 2) # # Scale correctly using ratio_net # p[:, 0] *= (ratio_w * ratio_net) # p[:, 1] *= (ratio_h * ratio_net) # adjusted.append(p) # return adjusted # craft_utils_module.adjustResultCoordinates = fixed_adjustResultCoordinates # # ========================================== # # --- 1. 
SETUP MODEL (Switched to BASE for stability) --- # device = "cuda" if torch.cuda.is_available() else "cpu" # print(f"Loading TrOCR-Base on {device}...") # # We use the 'base' model because 'small' hallucinates Wikipedia text on tight crops # MODEL_ID = "microsoft/trocr-base-handwritten" # processor = TrOCRProcessor.from_pretrained(MODEL_ID) # model = VisionEncoderDecoderModel.from_pretrained(MODEL_ID).to(device).eval() # print("Loading CRAFT...") # craft = Craft(output_dir=None, crop_type="box", cuda=(device == "cuda")) # # --- 2. HELPER FUNCTIONS --- # def get_sorted_boxes(boxes): # """Sorts boxes top-to-bottom (lines), then left-to-right.""" # if not boxes: return [] # items = [] # for box in boxes: # cy = np.mean(box[:, 1]) # cx = np.mean(box[:, 0]) # items.append((cy, cx, box)) # # Sort by line (approx 20px tolerance) then by column # items.sort(key=lambda x: (int(x[0] // 20), x[1])) # return [x[2] for x in items] # def process_image(image): # if image is None: # return None, [], "Please upload an image." # # Convert to standard RGB Numpy array # # We use the FULL resolution image (no resizing) to keep text sharp # image_np = np.array(image.convert("RGB")) # # 1. DETECT # # The patch ensures coordinates map perfectly to this full-res image # prediction = craft.detect_text(image_np) # boxes = prediction.get("boxes", []) # if not boxes: # return image, [], "No text detected." # sorted_boxes = get_sorted_boxes(boxes) # annotated_img = image_np.copy() # results = [] # debug_crops = [] # # 2. PROCESS BOXES # for box in sorted_boxes: # box_int = box.astype(np.int32) # # Draw the box (Visual verification) # cv2.polylines(annotated_img, [box_int], True, (255, 0, 0), 3) # # --- CROP WITH PADDING (Crucial Fix) --- # # TrOCR needs 'breathing room' or it hallucinates. 
# PADDING = 10 # x_min = max(0, np.min(box_int[:, 0]) - PADDING) # x_max = min(image_np.shape[1], np.max(box_int[:, 0]) + PADDING) # y_min = max(0, np.min(box_int[:, 1]) - PADDING) # y_max = min(image_np.shape[0], np.max(box_int[:, 1]) + PADDING) # # Skip noise # if (x_max - x_min) < 20 or (y_max - y_min) < 10: # continue # crop = image_np[y_min:y_max, x_min:x_max] # # Convert to PIL for Model # pil_crop = Image.fromarray(crop) # # Add to debug gallery so user can see what the model sees # debug_crops.append(pil_crop) # # 3. RECOGNIZE # with torch.no_grad(): # pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device) # generated_ids = model.generate(pixel_values) # text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] # if text.strip(): # results.append(text) # full_text = "\n".join(results) # return Image.fromarray(annotated_img), debug_crops, full_text # # --- 3. GRADIO UI --- # with gr.Blocks(theme=gr.themes.Soft()) as demo: # gr.Markdown("# 📝 Robust Handwritten OCR (Base Model)") # gr.Markdown("Includes padding and a stronger model to prevent hallucinations.") # with gr.Row(): # with gr.Column(scale=1): # input_img = gr.Image(type="pil", label="Upload Image") # btn = gr.Button("Transcribe", variant="primary") # with gr.Column(scale=1): # output_img = gr.Image(label="Detections") # output_txt = gr.Textbox(label="Extracted Text", lines=15, show_copy_button=True) # with gr.Row(): # # Gallery to check if crops are valid or empty # crop_gallery = gr.Gallery(label="Debug: See what the model sees (Crops)", columns=6, height=200) # btn.click(process_image, input_img, [output_img, crop_gallery, output_txt]) # if __name__ == "__main__": # demo.launch() # import gradio as gr # import torch # import numpy as np # import cv2 # from PIL import Image # from transformers import TrOCRProcessor, VisionEncoderDecoderModel # from paddleocr import PaddleOCR # # --- 1. 
SETUP TR-OCR (Recognition) --- # device = "cuda" if torch.cuda.is_available() else "cpu" # print(f"Loading TrOCR on {device}...") # processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten') # model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten').to(device).eval() # # --- 2. SETUP PADDLEOCR (Detection Only) --- # print("Loading PaddleOCR (DBNet)...") # # We load the detector but we will bypass the main .ocr() method to avoid bugs # detector = PaddleOCR(use_angle_cls=True, lang='en', show_log=False) # def get_sorted_boxes(boxes): # """Sorts boxes top-to-bottom (lines), then left-to-right.""" # if boxes is None or len(boxes) == 0: # return [] # items = [] # for box in boxes: # # Paddle returns boxes as numpy arrays or lists # box = np.array(box).astype(np.float32) # cy = np.mean(box[:, 1]) # cx = np.mean(box[:, 0]) # items.append((cy, cx, box)) # # Sort by Y (line tolerance 20px) then X # items.sort(key=lambda x: (int(x[0] // 20), x[1])) # return [x[2] for x in items] # def process_image(image): # if image is None: # return None, [], "Please upload an image." # # Convert to standard RGB Numpy array # image_np = np.array(image.convert("RGB")) # # ============================================================ # # 🔴 FIX: Direct Detection Bypass # # ============================================================ # # The standard 'detector.ocr()' method has a bug in the current # # version that crashes when checking "if not boxes". # # We call the internal 'text_detector' directly to skip that check. # try: # dt_boxes, _ = detector.text_detector(image_np) # except Exception as e: # return image, [], f"Detection Error: {str(e)}" # if dt_boxes is None or len(dt_boxes) == 0: # return image, [], "No text detected." 
# # dt_boxes is already a numpy array of coordinates # sorted_boxes = get_sorted_boxes(dt_boxes) # annotated_img = image_np.copy() # results = [] # debug_crops = [] # # Process Boxes # for box in sorted_boxes: # box_int = box.astype(np.int32) # # Draw Box (Red, thickness 2) # cv2.polylines(annotated_img, [box_int], True, (255, 0, 0), 2) # # Crop with Padding (Prevents TrOCR Hallucinations) # PADDING = 10 # x_min = max(0, np.min(box_int[:, 0]) - PADDING) # x_max = min(image_np.shape[1], np.max(box_int[:, 0]) + PADDING) # y_min = max(0, np.min(box_int[:, 1]) - PADDING) # y_max = min(image_np.shape[0], np.max(box_int[:, 1]) + PADDING) # # Skip noise # if (x_max - x_min) < 15 or (y_max - y_min) < 10: # continue # crop = image_np[y_min:y_max, x_min:x_max] # pil_crop = Image.fromarray(crop) # debug_crops.append(pil_crop) # # Recognition (TrOCR) # with torch.no_grad(): # pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device) # generated_ids = model.generate(pixel_values) # text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] # if text.strip(): # results.append(text) # full_text = "\n".join(results) # return Image.fromarray(annotated_img), debug_crops, full_text # # --- UI --- # with gr.Blocks(theme=gr.themes.Soft()) as demo: # gr.Markdown("# ⚡ PaddleOCR + TrOCR (Robust)") # gr.Markdown("Using direct DBNet inference to avoid library bugs.") # with gr.Row(): # with gr.Column(scale=1): # input_img = gr.Image(type="pil", label="Upload Image") # btn = gr.Button("Transcribe", variant="primary") # with gr.Column(scale=1): # output_img = gr.Image(label="Detections (Paddle)") # output_txt = gr.Textbox(label="Extracted Text", lines=15, show_copy_button=True) # with gr.Row(): # gallery = gr.Gallery(label="Line Crops (Debug)", columns=6, height=200) # btn.click(process_image, input_img, [output_img, gallery, output_txt]) # if __name__ == "__main__": # demo.launch() # import gradio as gr # import torch # import numpy as np # import 
cv2 # from PIL import Image # from transformers import TrOCRProcessor, VisionEncoderDecoderModel # from paddleocr import PaddleOCR # # --- 1. SETUP TR-OCR --- # device = "cuda" if torch.cuda.is_available() else "cpu" # print(f"Loading TrOCR on {device}...") # processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten') # model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten').to(device).eval() # # --- 2. SETUP PADDLEOCR --- # print("Loading PaddleOCR...") # # High resolution to catch faint text # detector = PaddleOCR(use_angle_cls=True, lang='en', show_log=False, # det_limit_side_len=2500, det_db_thresh=0.1, det_db_box_thresh=0.3) # # ========================================== # # 🧠 LOGIC FIX 1: REMOVE NESTED BOXES # # ========================================== # def calculate_overlap_area(box1, box2): # """Calculates the intersection area between two boxes.""" # x1 = max(box1[0], box2[0]) # y1 = max(box1[1], box2[1]) # x2 = min(box1[2], box2[2]) # y2 = min(box1[3], box2[3]) # if x2 < x1 or y2 < y1: # return 0.0 # return (x2 - x1) * (y2 - y1) # def filter_nested_boxes(boxes, containment_thresh=0.80): # """ # Removes boxes that are mostly contained within other larger boxes. # """ # if not boxes: return [] # # Convert all to [x1, y1, x2, y2, area] # active = [] # for b in boxes: # area = (b[2] - b[0]) * (b[3] - b[1]) # active.append(list(b) + [area]) # # Sort by area (Largest to Smallest) - Crucial! # # We want to keep the big 'parent' box and delete the small 'child' box. 
# active.sort(key=lambda x: x[4], reverse=True) # final_boxes = [] # for i, current in enumerate(active): # is_nested = False # curr_area = current[4] # # Check against all boxes we've already accepted (which are bigger/same size) # for kept in final_boxes: # overlap = calculate_overlap_area(current, kept) # # Check if 'current' is inside 'kept' # # If >80% of current box is covered by kept box, it's a duplicate/nested box # if (overlap / curr_area) > containment_thresh: # is_nested = True # break # if not is_nested: # final_boxes.append(current[:4]) # Store only coord, drop area # return final_boxes # # ========================================== # # 🧠 LOGIC FIX 2: MERGE WORDS INTO LINES # # ========================================== # def merge_boxes_into_lines(raw_boxes, y_thresh=30): # if raw_boxes is None or len(raw_boxes) == 0: # return [] # # 1. Convert raw polygons to Axis-Aligned Rectangles # rects = [] # for box in raw_boxes: # box = np.array(box).astype(np.float32) # x1 = np.min(box[:, 0]) # y1 = np.min(box[:, 1]) # x2 = np.max(box[:, 0]) # y2 = np.max(box[:, 1]) # rects.append([x1, y1, x2, y2]) # # 🔴 STEP 2: Filter Nested Boxes (Remove the 'child' boxes) # rects = filter_nested_boxes(rects) # # 3. Sort by Y center # rects.sort(key=lambda r: (r[1] + r[3]) / 2) # merged_lines = [] # while rects: # current_line = [rects.pop(0)] # line_y_center = (current_line[0][1] + current_line[0][3]) / 2 # remaining = [] # for r in rects: # r_y_center = (r[1] + r[3]) / 2 # # If Y-center is close (same horizontal line) # if abs(r_y_center - line_y_center) < y_thresh: # current_line.append(r) # else: # remaining.append(r) # rects = remaining # # 4. 
Create Line Box # lx1 = min(r[0] for r in current_line) # ly1 = min(r[1] for r in current_line) # lx2 = max(r[2] for r in current_line) # ly2 = max(r[3] for r in current_line) # merged_lines.append([lx1, ly1, lx2, ly2]) # # Final Sort by Y # merged_lines.sort(key=lambda r: r[1]) # return merged_lines # def process_image(image): # if image is None: return None, [], "Please upload an image." # image_np = np.array(image.convert("RGB")) # # DETECT # try: # dt_boxes, _ = detector.text_detector(image_np) # except Exception as e: # return image, [], f"Detection Error: {str(e)}" # if dt_boxes is None or len(dt_boxes) == 0: # return image, [], "No text detected." # # PROCESS (Filter Nested -> Merge Lines) # line_boxes = merge_boxes_into_lines(dt_boxes) # annotated_img = image_np.copy() # results = [] # debug_crops = [] # for box in line_boxes: # x1, y1, x2, y2 = map(int, box) # # Filter Noise # if (x2 - x1) < 20 or (y2 - y1) < 15: # continue # # Draw (Green) # cv2.rectangle(annotated_img, (x1, y1), (x2, y2), (0, 255, 0), 2) # # PADDING # PAD = 10 # h, w, _ = image_np.shape # x1 = max(0, x1 - PAD) # y1 = max(0, y1 - PAD) # x2 = min(w, x2 + PAD) # y2 = min(h, y2 + PAD) # crop = image_np[y1:y2, x1:x2] # pil_crop = Image.fromarray(crop) # debug_crops.append(pil_crop) # # RECOGNIZE # with torch.no_grad(): # pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device) # generated_ids = model.generate(pixel_values) # text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] # if text.strip(): # results.append(text) # full_text = "\n".join(results) # return Image.fromarray(annotated_img), debug_crops, full_text # # --- UI --- # with gr.Blocks(theme=gr.themes.Soft()) as demo: # gr.Markdown("# ⚡ Smart Line-Level OCR (Cleaned)") # with gr.Row(): # with gr.Column(scale=1): # input_img = gr.Image(type="pil", label="Upload Image") # btn = gr.Button("Transcribe", variant="primary") # with gr.Column(scale=1): # output_img = gr.Image(label="Cleaned 
Lines (Green Boxes)") # output_txt = gr.Textbox(label="Extracted Text", lines=15, show_copy_button=True) # with gr.Row(): # gallery = gr.Gallery(label="Final Line Crops", columns=4, height=200) # btn.click(process_image, input_img, [output_img, gallery, output_txt]) # if __name__ == "__main__": # demo.launch() # import gradio as gr # import torch # import numpy as np # import cv2 # from PIL import Image # from transformers import TrOCRProcessor, VisionEncoderDecoderModel # from paddleocr import PaddleOCR # # Setup # device = "cuda" if torch.cuda.is_available() else "cpu" # print(f"Loading TrOCR on {device}...") # processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten') # model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten').to(device).eval() # print("Loading PaddleOCR...") # detector = PaddleOCR(use_angle_cls=True, lang='en', show_log=False, # det_limit_side_len=2500, det_db_thresh=0.1, det_db_box_thresh=0.3) # def calculate_iou(box1, box2): # """Calculate Intersection over Union""" # x1 = max(box1[0], box2[0]) # y1 = max(box1[1], box2[1]) # x2 = min(box1[2], box2[2]) # y2 = min(box1[3], box2[3]) # if x2 < x1 or y2 < y1: # return 0.0 # intersection = (x2 - x1) * (y2 - y1) # area1 = (box1[2] - box1[0]) * (box1[3] - box1[1]) # area2 = (box2[2] - box2[0]) * (box2[3] - box2[1]) # return intersection / min(area1, area2) # def remove_nested_boxes(boxes, iou_thresh=0.7): # """Remove boxes that are nested inside others""" # if len(boxes) == 0: # return [] # # Add area to each box # boxes_with_area = [] # for b in boxes: # area = (b[2] - b[0]) * (b[3] - b[1]) # boxes_with_area.append((*b, area)) # # Sort by area descending (keep larger boxes) # boxes_with_area.sort(key=lambda x: x[4], reverse=True) # keep = [] # for i, current in enumerate(boxes_with_area): # should_keep = True # curr_box = current[:4] # for kept in keep: # iou = calculate_iou(curr_box, kept) # if iou > iou_thresh: # should_keep = False # break # if 
should_keep: # keep.append(curr_box) # return keep # def merge_boxes_into_lines(raw_boxes, y_overlap_thresh=0.5, x_gap_thresh=100): # """Merge boxes into lines with better horizontal merging""" # if raw_boxes is None or len(raw_boxes) == 0: # return [] # # Convert polygons to rectangles # rects = [] # for box in raw_boxes: # box = np.array(box).astype(np.float32) # x1, y1 = np.min(box[:, 0]), np.min(box[:, 1]) # x2, y2 = np.max(box[:, 0]), np.max(box[:, 1]) # rects.append([x1, y1, x2, y2]) # # Remove nested boxes # rects = remove_nested_boxes(rects) # if len(rects) == 0: # return [] # # Sort by Y position # rects.sort(key=lambda r: r[1]) # # Group into lines based on Y overlap # lines = [] # current_line = [rects[0]] # for rect in rects[1:]: # # Check if rect belongs to current line # line_y1 = min(r[1] for r in current_line) # line_y2 = max(r[3] for r in current_line) # line_height = line_y2 - line_y1 # rect_y1, rect_y2 = rect[1], rect[3] # rect_height = rect_y2 - rect_y1 # # Calculate vertical overlap # overlap_y1 = max(line_y1, rect_y1) # overlap_y2 = min(line_y2, rect_y2) # overlap = max(0, overlap_y2 - overlap_y1) # # If significant vertical overlap, it's the same line # if overlap > y_overlap_thresh * min(line_height, rect_height): # current_line.append(rect) # else: # # Save current line and start new one # lines.append(current_line) # current_line = [rect] # lines.append(current_line) # # Merge boxes in each line # merged = [] # for line in lines: # # Sort line boxes left to right # line.sort(key=lambda r: r[0]) # # Merge horizontally close boxes # merged_line = [line[0]] # for rect in line[1:]: # last = merged_line[-1] # # If close horizontally, merge # if rect[0] - last[2] < x_gap_thresh: # merged_line[-1] = [ # min(last[0], rect[0]), # min(last[1], rect[1]), # max(last[2], rect[2]), # max(last[3], rect[3]) # ] # else: # merged_line.append(rect) # # Final merge: combine all boxes in line into one # x1 = min(r[0] for r in merged_line) # y1 = min(r[1] for r 
in merged_line) # x2 = max(r[2] for r in merged_line) # y2 = max(r[3] for r in merged_line) # merged.append([x1, y1, x2, y2]) # # Sort by Y # merged.sort(key=lambda r: r[1]) # return merged # def process_image(image): # if image is None: # return None, [], "Please upload an image." # image_np = np.array(image.convert("RGB")) # try: # dt_boxes, _ = detector.text_detector(image_np) # except Exception as e: # return image, [], f"Detection Error: {str(e)}" # if dt_boxes is None or len(dt_boxes) == 0: # return image, [], "No text detected." # line_boxes = merge_boxes_into_lines(dt_boxes) # annotated_img = image_np.copy() # results = [] # debug_crops = [] # for box in line_boxes: # x1, y1, x2, y2 = map(int, box) # if (x2 - x1) < 20 or (y2 - y1) < 15: # continue # cv2.rectangle(annotated_img, (x1, y1), (x2, y2), (0, 255, 0), 2) # PAD = 10 # h, w, _ = image_np.shape # x1 = max(0, x1 - PAD) # y1 = max(0, y1 - PAD) # x2 = min(w, x2 + PAD) # y2 = min(h, y2 + PAD) # crop = image_np[y1:y2, x1:x2] # pil_crop = Image.fromarray(crop) # debug_crops.append(pil_crop) # with torch.no_grad(): # pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device) # generated_ids = model.generate(pixel_values) # text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] # if text.strip(): # results.append(text) # full_text = "\n".join(results) # return Image.fromarray(annotated_img), debug_crops, full_text # with gr.Blocks(theme=gr.themes.Soft()) as demo: # gr.Markdown("# ⚡ Smart Line-Level OCR (Fixed)") # with gr.Row(): # with gr.Column(scale=1): # input_img = gr.Image(type="pil", label="Upload Image") # btn = gr.Button("Transcribe", variant="primary") # with gr.Column(scale=1): # output_img = gr.Image(label="Detected Lines") # output_txt = gr.Textbox(label="Extracted Text", lines=15, show_copy_button=True) # with gr.Row(): # gallery = gr.Gallery(label="Line Crops", columns=4, height=200) # btn.click(process_image, input_img, [output_img, gallery, 
#             output_txt])
# if __name__ == "__main__":
#     demo.launch()

# Alternative detector considered: https://github.com/czczup/FAST
import gradio as gr
import torch
import numpy as np
import cv2
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from paddleocr import PaddleOCR
import pandas as pd

# --- 1. SETUP TR-OCR (handwriting recognition) ---
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Loading TrOCR on {device}...")
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten').to(device).eval()

# --- 2. SETUP PADDLEOCR (used for text DETECTION only) ---
print("Loading PaddleOCR...")
# High resolution settings to detect faint text: a large side limit plus low
# DB thresholds make the detector more sensitive (more boxes / more recall).
detector = PaddleOCR(use_angle_cls=True, lang='en', show_log=False,
                     det_limit_side_len=2500, det_db_thresh=0.1,
                     det_db_box_thresh=0.3)


# ==========================================
# 🧠 LOGIC: INTERSECTION OVER UNION (IOU)
# ==========================================
def calculate_iou_containment(box1, box2):
    """Return the fraction (0.0–1.0) of ``box1``'s area that lies inside ``box2``.

    Boxes are axis-aligned rectangles ``[x1, y1, x2, y2]``. Note this is a
    *containment* ratio, not symmetric IoU: the intersection is normalized by
    box1's own area, so a small box fully inside a large one scores 1.0.
    """
    ix1 = max(box1[0], box2[0])
    iy1 = max(box1[1], box2[1])
    ix2 = min(box1[2], box2[2])
    iy2 = min(box1[3], box2[3])
    if ix2 < ix1 or iy2 < iy1:
        return 0.0  # rectangles do not overlap at all
    intersection = (ix2 - ix1) * (iy2 - iy1)
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    # FIX: a degenerate (zero-width or zero-height) detector box previously
    # raised ZeroDivisionError here and crashed the whole request.
    if area1 <= 0:
        return 0.0
    return intersection / area1


def filter_nested_boxes(boxes, containment_thresh=0.85):
    """Drop boxes that are mostly contained within other, larger boxes.

    Args:
        boxes: list of ``[x1, y1, x2, y2]`` rectangles.
        containment_thresh: fraction of a box's area that must lie inside an
            already-kept (larger) box for it to be discarded as a duplicate.

    Returns:
        Filtered list of ``[x1, y1, x2, y2]`` rectangles.
    """
    if not boxes:
        return []
    # Annotate each box with its area: [x1, y1, x2, y2, area].
    active = []
    for b in boxes:
        area = (b[2] - b[0]) * (b[3] - b[1])
        active.append(list(b) + [area])
    # Sort by area descending (biggest first) so the "parent" box is always
    # accepted before any "child" box nested inside it is examined.
    active.sort(key=lambda x: x[4], reverse=True)
    final_boxes = []
    for current in active:
        is_nested = False
        curr_box = current[:4]
        # Check whether this box sits inside any bigger box we already kept.
        for kept in final_boxes:
            overlap_ratio = calculate_iou_containment(curr_box, kept)
            if overlap_ratio > containment_thresh:
                is_nested = True
                break
        if not is_nested:
            final_boxes.append(curr_box)  # keep coordinates only, drop area
    return final_boxes


# ==========================================
# 🧠 LOGIC: STRICT LINE MERGING
# ==========================================
def merge_boxes_into_lines(raw_boxes, log_data):
    """Merge word-level detections horizontally into one box per text line.

    Boxes are merged only when their vertical centers are close (within 50%
    of the seed box's height), which prevents accidental vertical merging of
    adjacent lines.

    Args:
        raw_boxes: detector output — iterable of 4-point polygons
            (N x 4 x 2 arrays/lists), or ``None``.
        log_data: list of strings; progress messages are appended in place.

    Returns:
        List of ``[x1, y1, x2, y2]`` line boxes sorted top-to-bottom.
    """
    if raw_boxes is None or len(raw_boxes) == 0:
        return []

    # 1. Convert each 4-point polygon to an axis-aligned rectangle.
    rects = []
    for box in raw_boxes:
        box = np.array(box).astype(np.float32)
        x1, y1 = np.min(box[:, 0]), np.min(box[:, 1])
        x2, y2 = np.max(box[:, 0]), np.max(box[:, 1])
        rects.append([x1, y1, x2, y2])
    log_data.append(f"Raw Detections: {len(rects)} boxes found.")

    # 2. Remove boxes nested inside larger ones (duplicate detections).
    rects = filter_nested_boxes(rects)
    log_data.append(f"After Cleaning Nested: {len(rects)} boxes remain.")

    # 3. Sort by vertical center (top to bottom) so each pass seeds a line
    #    with the highest remaining box.
    rects.sort(key=lambda r: (r[1] + r[3]) / 2)

    lines = []
    while rects:
        # Start a new line with the highest remaining box; its height sets
        # the dynamic tolerance for what counts as "the same line".
        current_line = [rects.pop(0)]
        ref_h = current_line[0][3] - current_line[0][1]
        ref_y_center = (current_line[0][1] + current_line[0][3]) / 2
        # STRICT RULE: a box joins this line only if its Y-center is within
        # 50% of the seed box's height. (A zero-height seed therefore forms
        # a line by itself.)
        vertical_tolerance = ref_h * 0.5

        remaining_rects = []
        for r in rects:
            r_y_center = (r[1] + r[3]) / 2
            if abs(r_y_center - ref_y_center) < vertical_tolerance:
                current_line.append(r)
            else:
                remaining_rects.append(r)
        rects = remaining_rects

        # Order the words left-to-right, then collapse the whole group into
        # one bounding box for the line.
        current_line.sort(key=lambda r: r[0])
        lx1 = min(r[0] for r in current_line)
        ly1 = min(r[1] for r in current_line)
        lx2 = max(r[2] for r in current_line)
        ly2 = max(r[3] for r in current_line)
        lines.append([lx1, ly1, lx2, ly2])

    # Final sort top-to-bottom so transcription reads in page order.
    lines.sort(key=lambda r: r[1])
    log_data.append(f"Final Merged Lines: {len(lines)} lines created.")
    return lines


def process_image(image):
    """Detect text lines with PaddleOCR, transcribe each line with TrOCR.

    Args:
        image: PIL image from the Gradio input (or ``None``).

    Returns:
        Tuple of (annotated PIL image, list of PIL line crops,
        transcribed text, debug log string).
    """
    logs = []  # Store debug messages here
    if image is None:
        return None, [], "Please upload an image.", "No logs."

    image_np = np.array(image.convert("RGB"))

    # DETECT — call the internal text_detector directly (detection only);
    # NOTE(review): this bypasses PaddleOCR's public .ocr() pipeline, so it
    # depends on a private attribute — verify when upgrading paddleocr.
    try:
        dt_boxes, _ = detector.text_detector(image_np)
    except Exception as e:
        return image, [], f"Detection Error: {str(e)}", "\n".join(logs)
    if dt_boxes is None or len(dt_boxes) == 0:
        return image, [], "No text detected.", "\n".join(logs)

    # PROCESS: filter nested boxes, then merge words into line boxes.
    line_boxes = merge_boxes_into_lines(dt_boxes, logs)

    annotated_img = image_np.copy()
    results = []
    debug_crops = []

    # Loop-invariant values hoisted out of the per-line loop.
    PAD = 10  # breathing room around each crop; tight crops make TrOCR hallucinate
    h, w, _ = image_np.shape

    # Log the final box coordinates for inspection.
    logs.append("\n--- Final Box Coordinates ---")
    for i, box in enumerate(line_boxes):
        x1, y1, x2, y2 = map(int, box)
        logs.append(f"Line {i+1}: x={x1}, y={y1}, w={x2-x1}, h={y2-y1}")

        # Filter noise: boxes too small to contain readable text.
        if (x2 - x1) < 20 or (y2 - y1) < 15:
            logs.append(f"-> Skipped Line {i+1} (Too Small/Noise)")
            continue

        # Draw the detected (unpadded) line box in green.
        cv2.rectangle(annotated_img, (x1, y1), (x2, y2), (0, 255, 0), 2)

        # Pad the crop, clamped to the image bounds.
        x1 = max(0, x1 - PAD)
        y1 = max(0, y1 - PAD)
        x2 = min(w, x2 + PAD)
        y2 = min(h, y2 + PAD)

        crop = image_np[y1:y2, x1:x2]
        pil_crop = Image.fromarray(crop)
        debug_crops.append(pil_crop)

        # RECOGNIZE the line crop with TrOCR.
        with torch.no_grad():
            pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
            generated_ids = model.generate(pixel_values)
            text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        if text.strip():
            results.append(text)

    full_text = "\n".join(results)
    return Image.fromarray(annotated_img), debug_crops, full_text, "\n".join(logs)


# --- UI ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# ⚡ Smart Line-Level OCR (Debug Mode)")
    with gr.Row():
        with gr.Column(scale=1):
            input_img = gr.Image(type="pil", label="Upload Image")
            btn = gr.Button("Transcribe", variant="primary")
        with gr.Column(scale=1):
            with gr.Tabs():
                with gr.Tab("Visualization"):
                    output_img = gr.Image(label="Detected Lines")
                with gr.Tab("Extracted Text"):
                    output_txt = gr.Textbox(label="Result", lines=15, show_copy_button=True)
                with gr.Tab("Debug Logs"):
                    # Textbox instead of gr.Code to avoid Gradio version errors.
                    log_output = gr.Textbox(label="Processing Logs", lines=20, interactive=False)
    with gr.Row():
        gallery = gr.Gallery(label="Final Line Crops", columns=4, height=200)
    btn.click(process_image, input_img, [output_img, gallery, output_txt, log_output])

if __name__ == "__main__":
    demo.launch()