iammraat committed
Commit 2e99bc0 · verified · 1 Parent(s): 204a6d4

Update app.py

Files changed (1): app.py (+155 −618)
app.py CHANGED
@@ -816,257 +816,6 @@
 
 
 
-# import gradio as gr
-# import torch
-# import numpy as np
-# import cv2
-# from PIL import Image
-# from transformers import TrOCRProcessor, VisionEncoderDecoderModel
-# from paddleocr import PaddleOCR
-# import pandas as pd
-
-# # --- 1. SETUP TR-OCR ---
-# device = "cuda" if torch.cuda.is_available() else "cpu"
-# print(f"Loading TrOCR on {device}...")
-# processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
-# model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten').to(device).eval()
-
-# # --- 2. SETUP PADDLEOCR ---
-# print("Loading PaddleOCR...")
-# # High resolution settings to detect faint text
-# detector = PaddleOCR(use_angle_cls=True, lang='en', show_log=False,
-#                      det_limit_side_len=2500, det_db_thresh=0.1, det_db_box_thresh=0.3)
-
-
-# # ==========================================
-# # 🧠 LOGIC: INTERSECTION OVER UNION (IOU)
-# # ==========================================
-# def calculate_iou_containment(box1, box2):
-#     """
-#     Calculates how much of box1 is inside box2.
-#     Returns: ratio (0.0 to 1.0)
-#     """
-#     x1 = max(box1[0], box2[0])
-#     y1 = max(box1[1], box2[1])
-#     x2 = min(box1[2], box2[2])
-#     y2 = min(box1[3], box2[3])
-
-#     if x2 < x1 or y2 < y1:
-#         return 0.0
-
-#     intersection = (x2 - x1) * (y2 - y1)
-#     area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
-
-#     return intersection / area1
-
-# def filter_nested_boxes(boxes, containment_thresh=0.85):
-#     """
-#     Removes boxes that are mostly contained within other larger boxes.
-#     """
-#     if not boxes: return []
-
-#     # [x1, y1, x2, y2, area]
-#     active = []
-#     for b in boxes:
-#         area = (b[2] - b[0]) * (b[3] - b[1])
-#         active.append(list(b) + [area])
-
-#     # Sort by Area descending (Biggest first)
-#     active.sort(key=lambda x: x[4], reverse=True)
-
-#     final_boxes = []
-
-#     for current in active:
-#         is_nested = False
-#         curr_box = current[:4]
-
-#         # Check if this box is inside any bigger box we already kept
-#         for kept in final_boxes:
-#             overlap_ratio = calculate_iou_containment(curr_box, kept)
-
-#             if overlap_ratio > containment_thresh:
-#                 is_nested = True
-#                 break
-
-#         if not is_nested:
-#             final_boxes.append(curr_box)
-
-#     return final_boxes
-
-
-# # ==========================================
-# # 🧠 LOGIC: STRICT LINE MERGING
-# # ==========================================
-# def merge_boxes_into_lines(raw_boxes, log_data):
-#     """
-#     Merges boxes horizontally but prevents vertical merging.
-#     """
-#     if raw_boxes is None or len(raw_boxes) == 0:
-#         return []
-
-#     # 1. Convert to Rects
-#     rects = []
-#     for box in raw_boxes:
-#         box = np.array(box).astype(np.float32)
-#         x1, y1 = np.min(box[:, 0]), np.min(box[:, 1])
-#         x2, y2 = np.max(box[:, 0]), np.max(box[:, 1])
-#         rects.append([x1, y1, x2, y2])
-
-#     log_data.append(f"Raw Detections: {len(rects)} boxes found.")
-
-#     # 2. Filter Nested
-#     rects = filter_nested_boxes(rects)
-#     log_data.append(f"After Cleaning Nested: {len(rects)} boxes remain.")
-
-#     # 3. Sort by Y-Center (Top to Bottom)
-#     rects.sort(key=lambda r: (r[1] + r[3]) / 2)
-
-#     lines = []
-
-#     while rects:
-#         # Start a new line with the highest remaining box
-#         current_line = [rects.pop(0)]
-
-#         # Calculate the dynamic "height" of this line based on the first word
-#         ref_h = current_line[0][3] - current_line[0][1]
-#         ref_y_center = (current_line[0][1] + current_line[0][3]) / 2
-
-#         # Look for other words on this SAME line
-#         # STRICT RULE: A box is on the same line ONLY if its Y-center
-#         # is within 50% of the reference box's height.
-#         vertical_tolerance = ref_h * 0.5
-
-#         remaining_rects = []
-#         for r in rects:
-#             r_y_center = (r[1] + r[3]) / 2
-
-#             if abs(r_y_center - ref_y_center) < vertical_tolerance:
-#                 current_line.append(r)
-#             else:
-#                 remaining_rects.append(r)
-
-#         rects = remaining_rects
-
-#         # Sort words in this line left-to-right
-#         current_line.sort(key=lambda r: r[0])
-
-#         # 4. Merge the horizontal group into ONE box
-#         lx1 = min(r[0] for r in current_line)
-#         ly1 = min(r[1] for r in current_line)
-#         lx2 = max(r[2] for r in current_line)
-#         ly2 = max(r[3] for r in current_line)
-
-#         lines.append([lx1, ly1, lx2, ly2])
-
-#     # Final Sort by Y
-#     lines.sort(key=lambda r: r[1])
-
-#     log_data.append(f"Final Merged Lines: {len(lines)} lines created.")
-#     return lines
-
-
-# def process_image(image):
-#     logs = []  # Store debug messages here
-
-#     if image is None:
-#         return None, [], "Please upload an image.", "No logs."
-
-#     image_np = np.array(image.convert("RGB"))
-
-#     # DETECT
-#     try:
-#         dt_boxes, _ = detector.text_detector(image_np)
-#     except Exception as e:
-#         return image, [], f"Detection Error: {str(e)}", "\n".join(logs)
-
-#     if dt_boxes is None or len(dt_boxes) == 0:
-#         return image, [], "No text detected.", "\n".join(logs)
-
-#     # PROCESS
-#     line_boxes = merge_boxes_into_lines(dt_boxes, logs)
-
-#     annotated_img = image_np.copy()
-#     results = []
-#     debug_crops = []
-
-#     # Log the final box coordinates for inspection
-#     logs.append("\n--- Final Box Coordinates ---")
-
-#     for i, box in enumerate(line_boxes):
-#         x1, y1, x2, y2 = map(int, box)
-
-#         logs.append(f"Line {i+1}: x={x1}, y={y1}, w={x2-x1}, h={y2-y1}")
-
-#         # Filter Noise
-#         if (x2 - x1) < 20 or (y2 - y1) < 15:
-#             logs.append(f"-> Skipped Line {i+1} (Too Small/Noise)")
-#             continue
-
-#         # Draw (Green)
-#         cv2.rectangle(annotated_img, (x1, y1), (x2, y2), (0, 255, 0), 2)
-
-#         # PADDING
-#         PAD = 10
-#         h, w, _ = image_np.shape
-#         x1 = max(0, x1 - PAD)
-#         y1 = max(0, y1 - PAD)
-#         x2 = min(w, x2 + PAD)
-#         y2 = min(h, y2 + PAD)
-
-#         crop = image_np[y1:y2, x1:x2]
-#         pil_crop = Image.fromarray(crop)
-#         debug_crops.append(pil_crop)
-
-#         # RECOGNIZE
-#         with torch.no_grad():
-#             pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
-#             generated_ids = model.generate(pixel_values)
-#             text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-#             if text.strip():
-#                 results.append(text)
-
-#     full_text = "\n".join(results)
-#     return Image.fromarray(annotated_img), debug_crops, full_text, "\n".join(logs)
-
-# # --- UI ---
-# with gr.Blocks(theme=gr.themes.Soft()) as demo:
-#     gr.Markdown("# ⚡ Smart Line-Level OCR (Debug Mode)")
-
-#     with gr.Row():
-#         with gr.Column(scale=1):
-#             input_img = gr.Image(type="pil", label="Upload Image")
-#             btn = gr.Button("Transcribe", variant="primary")
-
-#         with gr.Column(scale=1):
-#             with gr.Tabs():
-#                 with gr.Tab("Visualization"):
-#                     output_img = gr.Image(label="Detected Lines")
-#                 with gr.Tab("Extracted Text"):
-#                     output_txt = gr.Textbox(label="Result", lines=15, show_copy_button=True)
-#                 with gr.Tab("Debug Logs"):
-#                     # CHANGED HERE: Uses Textbox instead of Code to avoid version errors
-#                     log_output = gr.Textbox(label="Processing Logs", lines=20, interactive=False)
-
-#     with gr.Row():
-#         gallery = gr.Gallery(label="Final Line Crops", columns=4, height=200)
-
-#     btn.click(process_image, input_img, [output_img, gallery, output_txt, log_output])
-
-# if __name__ == "__main__":
-#     demo.launch()
-
-
 import gradio as gr
 import torch
 import numpy as np
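
Note: the containment check shared by both versions of app.py is one-sided — intersection area divided by the first box's own area, not symmetric IoU. A minimal standalone sketch of how that ratio behaves, with toy coordinates that are illustrative only, not taken from the app:

    # Containment of box1 in box2, both as [x1, y1, x2, y2].
    def containment(box1, box2):
        ix1, iy1 = max(box1[0], box2[0]), max(box1[1], box2[1])
        ix2, iy2 = min(box1[2], box2[2]), min(box1[3], box2[3])
        if ix2 < ix1 or iy2 < iy1:
            return 0.0  # boxes do not overlap at all
        inter = (ix2 - ix1) * (iy2 - iy1)
        area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
        return inter / area1 if area1 else 0.0

    print(containment([10, 10, 50, 30], [0, 0, 200, 40]))    # 1.0  -> dropped at thresh 0.85
    print(containment([190, 10, 250, 30], [0, 0, 200, 40]))  # ~0.17 -> kept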
@@ -1074,192 +823,64 @@ import cv2
 from PIL import Image
 from transformers import TrOCRProcessor, VisionEncoderDecoderModel
 from paddleocr import PaddleOCR
-from sklearn.cluster import DBSCAN
-from scipy.spatial.distance import pdist, squareform
-import warnings
-warnings.filterwarnings('ignore')
+import pandas as pd
 
-# ==========================================
-# 🚀 SETUP MODELS
-# ==========================================
+# --- 1. SETUP TR-OCR ---
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Loading TrOCR on {device}...")
-
-# Upgraded to TrOCR-Large for better accuracy
 processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
 model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten').to(device).eval()
 
+# --- 2. SETUP PADDLEOCR ---
 print("Loading PaddleOCR...")
-# Optimized settings for handwriting detection
-detector = PaddleOCR(
-    use_angle_cls=True,
-    lang='en',
-    show_log=False,
-    det_limit_side_len=2500,  # High resolution
-    det_db_thresh=0.2,        # More sensitive threshold
-    det_db_box_thresh=0.4,    # Better box filtering
-    det_db_unclip_ratio=1.8   # Larger text regions for handwriting
-)
+# High resolution settings to detect faint text
+detector = PaddleOCR(use_angle_cls=True, lang='en', show_log=False,
+                     det_limit_side_len=2500, det_db_thresh=0.1, det_db_box_thresh=0.3)
 
 
 # ==========================================
-# 🧠 PREPROCESSING FOR HANDWRITING
+# 🧠 LOGIC: INTERSECTION OVER UNION (IOU)
 # ==========================================
-def preprocess_for_handwriting(image_np):
-    """
-    Enhanced preprocessing specifically for handwriting.
-    Returns preprocessed image for better detection.
-    """
-    # Convert to grayscale
-    gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
-
-    # Apply bilateral filter to reduce noise while preserving edges
-    denoised = cv2.bilateralFilter(gray, 9, 75, 75)
-
-    # Adaptive thresholding (better for varying lighting)
-    binary = cv2.adaptiveThreshold(
-        denoised, 255,
-        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
-        cv2.THRESH_BINARY,
-        15, 10
-    )
-
-    # Optional: Deskew the image
-    coords = np.column_stack(np.where(binary > 0))
-    if len(coords) > 0:
-        angle = cv2.minAreaRect(coords)[-1]
-        if angle < -45:
-            angle = -(90 + angle)
-        else:
-            angle = -angle
-
-        # Only deskew if angle is significant (> 0.5 degrees)
-        if abs(angle) > 0.5:
-            (h, w) = binary.shape
-            center = (w // 2, h // 2)
-            M = cv2.getRotationMatrix2D(center, angle, 1.0)
-            binary = cv2.warpAffine(
-                binary, M, (w, h),
-                flags=cv2.INTER_CUBIC,
-                borderMode=cv2.BORDER_REPLICATE
-            )
-
-    # Convert back to RGB for PaddleOCR
-    return cv2.cvtColor(binary, cv2.COLOR_GRAY2RGB)
-
-
-# ==========================================
-# 🧠 IMPROVED LINE DETECTION WITH DBSCAN
-# ==========================================
-def cluster_boxes_into_lines(raw_boxes, log_data, eps_multiplier=0.35):
+def calculate_iou_containment(box1, box2):
     """
-    Uses DBSCAN clustering to intelligently group text boxes into lines.
-    This handles irregular handwriting baselines much better than rule-based methods.
-
-    Args:
-        eps_multiplier: Controls clustering sensitivity (lower = stricter line separation)
-                        Default 0.35 prevents multi-line merging
+    Calculates how much of box1 is inside box2.
+    Returns: ratio (0.0 to 1.0)
     """
-    if raw_boxes is None or len(raw_boxes) == 0:
-        return []
-
-    # 1. Convert PaddleOCR boxes to rectangles
-    rects = []
-    for box in raw_boxes:
-        box = np.array(box).astype(np.float32)
-        x1, y1 = np.min(box[:, 0]), np.min(box[:, 1])
-        x2, y2 = np.max(box[:, 0]), np.max(box[:, 1])
-        rects.append([x1, y1, x2, y2])
-
-    log_data.append(f"✓ Raw Detections: {len(rects)} boxes found.")
-
-    # 2. Filter out noise and very small boxes
-    filtered_rects = []
-    for rect in rects:
-        w = rect[2] - rect[0]
-        h = rect[3] - rect[1]
-        if w > 15 and h > 10:  # Minimum size threshold
-            filtered_rects.append(rect)
-
-    rects = filtered_rects
-    log_data.append(f"✓ After noise filtering: {len(rects)} boxes remain.")
-
-    if len(rects) == 0:
-        return []
-
-    # 3. Remove nested/overlapping boxes
-    rects = filter_nested_boxes(rects)
-    log_data.append(f"✓ After removing nested boxes: {len(rects)} boxes remain.")
-
-    # 4. DBSCAN clustering by Y-coordinate
-    # Extract y-centers for clustering
-    y_centers = np.array([(r[1] + r[3]) / 2 for r in rects])
-
-    # Calculate adaptive epsilon based on median box height
-    heights = np.array([r[3] - r[1] for r in rects])
-    median_height = np.median(heights)
-
-    # CRITICAL FIX: Lower multiplier to prevent multi-line merging
-    # 0.35 = strict line separation, 0.6 = more permissive (old value)
-    eps = median_height * eps_multiplier
-
-    log_data.append(f"✓ Clustering parameters: median_height={median_height:.1f}px, eps={eps:.1f}px (multiplier={eps_multiplier})")
-
-    # Perform clustering
-    clustering = DBSCAN(eps=eps, min_samples=1, metric='euclidean')
-    labels = clustering.fit_predict(y_centers.reshape(-1, 1))
-
-    log_data.append(f"✓ DBSCAN found {len(set(labels))} text lines.")
-
-    # 5. Group boxes by cluster labels
-    lines = []
-    for label in set(labels):
-        # Get all boxes in this cluster
-        line_boxes = [rects[i] for i, l in enumerate(labels) if l == label]
-
-        # Sort boxes left-to-right within the line
-        line_boxes.sort(key=lambda b: b[0])
-
-        # Merge into a single bounding box for the entire line
-        x1 = min(b[0] for b in line_boxes)
-        y1 = min(b[1] for b in line_boxes)
-        x2 = max(b[2] for b in line_boxes)
-        y2 = max(b[3] for b in line_boxes)
-
-        lines.append([x1, y1, x2, y2])
-
-    # Sort lines top-to-bottom
-    lines.sort(key=lambda r: r[1])
-
-    log_data.append(f"✓ Final merged lines: {len(lines)} lines created.\n")
-
-    return lines
+    x1 = max(box1[0], box2[0])
+    y1 = max(box1[1], box2[1])
+    x2 = min(box1[2], box2[2])
+    y2 = min(box1[3], box2[3])
+
+    if x2 < x1 or y2 < y1:
+        return 0.0
+
+    intersection = (x2 - x1) * (y2 - y1)
+    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
+
+    return intersection / area1
 
 
 def filter_nested_boxes(boxes, containment_thresh=0.85):
     """
     Removes boxes that are mostly contained within other larger boxes.
-    This prevents duplicate detections.
     """
-    if not boxes:
-        return []
+    if not boxes: return []
 
-    # Add area to each box
-    boxes_with_area = []
+    # [x1, y1, x2, y2, area]
+    active = []
     for b in boxes:
         area = (b[2] - b[0]) * (b[3] - b[1])
-        boxes_with_area.append(list(b) + [area])
+        active.append(list(b) + [area])
 
-    # Sort by area (largest first)
-    boxes_with_area.sort(key=lambda x: x[4], reverse=True)
+    # Sort by Area descending (Biggest first)
+    active.sort(key=lambda x: x[4], reverse=True)
 
     final_boxes = []
 
-    for current in boxes_with_area:
+    for current in active:
         is_nested = False
         curr_box = current[:4]
 
-        # Check if this box is contained within any already-kept box
+        # Check if this box is inside any bigger box we already kept
        for kept in final_boxes:
            overlap_ratio = calculate_iou_containment(curr_box, kept)
@@ -1273,262 +894,178 @@ def filter_nested_boxes(boxes, containment_thresh=0.85):
     return final_boxes
 
 
-def calculate_iou_containment(box1, box2):
-    """
-    Calculates how much of box1 is inside box2.
-    Returns: ratio (0.0 to 1.0)
-    """
-    x1 = max(box1[0], box2[0])
-    y1 = max(box1[1], box2[1])
-    x2 = min(box1[2], box2[2])
-    y2 = min(box1[3], box2[3])
-
-    if x2 < x1 or y2 < y1:
-        return 0.0
-
-    intersection = (x2 - x1) * (y2 - y1)
-    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
-
-    if area1 == 0:
-        return 0.0
-
-    return intersection / area1
-
-
 # ==========================================
-# 🧠 ENHANCED TEXT RECOGNITION
+# 🧠 LOGIC: STRICT LINE MERGING
 # ==========================================
-def recognize_text_batch(crops, batch_size=4):
+def merge_boxes_into_lines(raw_boxes, log_data):
     """
-    Process multiple crops in batches for better efficiency.
+    Merges boxes horizontally but prevents vertical merging.
     """
-    results = []
-
-    for i in range(0, len(crops), batch_size):
-        batch_crops = crops[i:i+batch_size]
-
-        with torch.no_grad():
-            pixel_values = processor(
-                images=batch_crops,
-                return_tensors="pt"
-            ).pixel_values.to(device)
-
-            generated_ids = model.generate(
-                pixel_values,
-                max_length=64,
-                num_beams=4,  # Beam search for better quality
-                early_stopping=True
-            )
-
-            texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
-            results.extend(texts)
-
-    return results
-
-
-# ==========================================
-# 🎯 MAIN PROCESSING FUNCTION
-# ==========================================
-def process_image(image, use_preprocessing=True, eps_multiplier=0.35):
-    """
-    Main OCR pipeline with optional preprocessing.
-
-    Args:
-        image: Input PIL image
-        use_preprocessing: Whether to apply preprocessing
-        eps_multiplier: DBSCAN epsilon multiplier for line clustering
-    """
-    logs = []
+    if raw_boxes is None or len(raw_boxes) == 0:
+        return []
+
+    # 1. Convert to Rects
+    rects = []
+    for box in raw_boxes:
+        box = np.array(box).astype(np.float32)
+        x1, y1 = np.min(box[:, 0]), np.min(box[:, 1])
+        x2, y2 = np.max(box[:, 0]), np.max(box[:, 1])
+        rects.append([x1, y1, x2, y2])
+
+    log_data.append(f"Raw Detections: {len(rects)} boxes found.")
+
+    # 2. Filter Nested
+    rects = filter_nested_boxes(rects)
+    log_data.append(f"After Cleaning Nested: {len(rects)} boxes remain.")
+
+    # 3. Sort by Y-Center (Top to Bottom)
+    rects.sort(key=lambda r: (r[1] + r[3]) / 2)
+
+    lines = []
+
+    while rects:
+        # Start a new line with the highest remaining box
+        current_line = [rects.pop(0)]
+
+        # Calculate the dynamic "height" of this line based on the first word
+        ref_h = current_line[0][3] - current_line[0][1]
+        ref_y_center = (current_line[0][1] + current_line[0][3]) / 2
+
+        # Look for other words on this SAME line
+        # STRICT RULE: A box is on the same line ONLY if its Y-center
+        # is within 50% of the reference box's height.
+        vertical_tolerance = ref_h * 0.5
+
+        remaining_rects = []
+        for r in rects:
+            r_y_center = (r[1] + r[3]) / 2
+
+            if abs(r_y_center - ref_y_center) < vertical_tolerance:
+                current_line.append(r)
+            else:
+                remaining_rects.append(r)
+
+        rects = remaining_rects
+
+        # Sort words in this line left-to-right
+        current_line.sort(key=lambda r: r[0])
+
+        # 4. Merge the horizontal group into ONE box
+        lx1 = min(r[0] for r in current_line)
+        ly1 = min(r[1] for r in current_line)
+        lx2 = max(r[2] for r in current_line)
+        ly2 = max(r[3] for r in current_line)
+
+        lines.append([lx1, ly1, lx2, ly2])
+
+    # Final Sort by Y
+    lines.sort(key=lambda r: r[1])
+
+    log_data.append(f"Final Merged Lines: {len(lines)} lines created.")
+    return lines
+
+
+def process_image(image):
+    logs = []  # Store debug messages here
 
     if image is None:
-        return None, [], "⚠️ Please upload an image.", "No logs."
-
-    logs.append("=" * 50)
-    logs.append("🚀 STARTING OCR PIPELINE")
-    logs.append("=" * 50 + "\n")
-
-    # Convert to numpy array
+        return None, [], "Please upload an image.", "No logs."
+
     image_np = np.array(image.convert("RGB"))
-    original_image = image_np.copy()
-
-    # Step 1: Preprocessing
-    if use_preprocessing:
-        logs.append("📝 Step 1: Preprocessing image for handwriting...")
-        preprocessed = preprocess_for_handwriting(image_np)
-        logs.append("✓ Preprocessing complete.\n")
-    else:
-        preprocessed = image_np
-        logs.append("📝 Step 1: Skipping preprocessing (disabled).\n")
-
-    # Step 2: Text Detection
-    logs.append("📝 Step 2: Detecting text regions...")
+
+    # DETECT
     try:
-        dt_boxes, _ = detector.text_detector(preprocessed)
+        dt_boxes, _ = detector.text_detector(image_np)
     except Exception as e:
-        error_msg = f"Detection Error: {str(e)}"
-        logs.append(error_msg)
-        return image, [], error_msg, "\n".join(logs)
+        return image, [], f"Detection Error: {str(e)}", "\n".join(logs)
 
     if dt_boxes is None or len(dt_boxes) == 0:
-        error_msg = "⚠️ No text detected in the image."
-        logs.append(error_msg)
-        return image, [], error_msg, "\n".join(logs)
-
-    # Step 3: Line Clustering
-    logs.append("\n📝 Step 3: Clustering text boxes into lines...")
-    line_boxes = cluster_boxes_into_lines(dt_boxes, logs, eps_multiplier=eps_multiplier)
-
-    if len(line_boxes) == 0:
-        error_msg = "⚠️ No valid text lines found after filtering."
-        logs.append(error_msg)
-        return image, [], error_msg, "\n".join(logs)
-
-    # Step 4: Extract and Recognize
-    logs.append("📝 Step 4: Extracting and recognizing text...\n")
-    logs.append("-" * 50)
-
-    annotated_img = original_image.copy()
+        return image, [], "No text detected.", "\n".join(logs)
+
+    # PROCESS
+    line_boxes = merge_boxes_into_lines(dt_boxes, logs)
+
+    annotated_img = image_np.copy()
+    results = []
     debug_crops = []
-    crop_images = []
+
+    # Log the final box coordinates for inspection
+    logs.append("\n--- Final Box Coordinates ---")
 
     for i, box in enumerate(line_boxes):
         x1, y1, x2, y2 = map(int, box)
 
-        logs.append(f"Line {i+1}: [{x1}, {y1}, {x2}, {y2}] (w={x2-x1}, h={y2-y1})")
-
-        # Draw bounding box on visualization
-        color = (0, 255, 0)  # Green
-        cv2.rectangle(annotated_img, (x1, y1), (x2, y2), color, 2)
-        cv2.putText(annotated_img, f"L{i+1}", (x1, y1-5),
-                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1)
-
-        # Add padding for better recognition
+        logs.append(f"Line {i+1}: x={x1}, y={y1}, w={x2-x1}, h={y2-y1}")
+
+        # Filter Noise
+        if (x2 - x1) < 20 or (y2 - y1) < 15:
+            logs.append(f"-> Skipped Line {i+1} (Too Small/Noise)")
+            continue
+
+        # Draw (Green)
+        cv2.rectangle(annotated_img, (x1, y1), (x2, y2), (0, 255, 0), 2)
+
+        # PADDING
         PAD = 10
-        h, w, _ = original_image.shape
-        x1_pad = max(0, x1 - PAD)
-        y1_pad = max(0, y1 - PAD)
-        x2_pad = min(w, x2 + PAD)
-        y2_pad = min(h, y2 + PAD)
-
-        # Crop the line
-        crop = original_image[y1_pad:y2_pad, x1_pad:x2_pad]
+        h, w, _ = image_np.shape
+        x1 = max(0, x1 - PAD)
+        y1 = max(0, y1 - PAD)
+        x2 = min(w, x2 + PAD)
+        y2 = min(h, y2 + PAD)
+
+        crop = image_np[y1:y2, x1:x2]
         pil_crop = Image.fromarray(crop)
-        crop_images.append(pil_crop)
         debug_crops.append(pil_crop)
-
-    logs.append("-" * 50)
-    logs.append(f"\n📝 Step 5: Running OCR on {len(crop_images)} line crops...")
-
-    # Batch recognition
-    recognized_texts = recognize_text_batch(crop_images, batch_size=4)
-
-    # Filter and log results
-    results = []
-    logs.append("\n" + "=" * 50)
-    logs.append("📄 RECOGNITION RESULTS")
-    logs.append("=" * 50 + "\n")
-
-    for i, text in enumerate(recognized_texts):
-        text = text.strip()
-        if text:
-            results.append(text)
-            logs.append(f"Line {i+1}: {text}")
-        else:
-            logs.append(f"Line {i+1}: [empty]")
-
-    # Final output
+
+        # RECOGNIZE
+        with torch.no_grad():
+            pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
+            generated_ids = model.generate(pixel_values)
+            text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+            if text.strip():
+                results.append(text)
+
     full_text = "\n".join(results)
-
-    logs.append("\n" + "=" * 50)
-    logs.append(f"✅ COMPLETE: {len(results)} lines transcribed.")
-    logs.append("=" * 50)
-
     return Image.fromarray(annotated_img), debug_crops, full_text, "\n".join(logs)
 
-
-# ==========================================
-# 🎨 GRADIO UI
-# ==========================================
-with gr.Blocks(theme=gr.themes.Soft(), title="Advanced OCR with DBSCAN") as demo:
-    gr.Markdown("""
-    # 🔬 Advanced Handwriting OCR with DBSCAN Clustering
-
-    **Improvements:**
-    - 🎯 DBSCAN clustering for intelligent line detection
-    - 🔍 TrOCR-Large model for better accuracy
-    - 🖼️ Preprocessing pipeline for handwriting
-    - ⚡ Batch processing for efficiency
-    - 📊 Detailed debug logs
-    """)
+# --- UI ---
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# ⚡ Smart Line-Level OCR (Debug Mode)")
 
     with gr.Row():
         with gr.Column(scale=1):
-            input_img = gr.Image(type="pil", label="📤 Upload Handwritten Image")
-
-            with gr.Accordion("⚙️ Options", open=False):
-                use_preprocess = gr.Checkbox(
-                    label="Enable preprocessing (denoising, deskewing)",
-                    value=True,
-                    info="Recommended for photos and low-quality scans"
-                )
-
-                eps_slider = gr.Slider(
-                    minimum=0.2,
-                    maximum=0.8,
-                    value=0.35,
-                    step=0.05,
-                    label="Line Separation Sensitivity",
-                    info="Lower = stricter separation (0.35 recommended for tight handwriting)"
-                )
-
-            btn = gr.Button("🚀 Transcribe", variant="primary", size="lg")
+            input_img = gr.Image(type="pil", label="Upload Image")
+            btn = gr.Button("Transcribe", variant="primary")
 
         with gr.Column(scale=1):
             with gr.Tabs():
-                with gr.Tab("🖼️ Visualization"):
+                with gr.Tab("Visualization"):
                     output_img = gr.Image(label="Detected Lines")
-                    gr.Markdown("*Green boxes show detected text lines with line numbers*")
-
-                with gr.Tab("📝 Extracted Text"):
-                    output_txt = gr.Textbox(
-                        label="Recognized Text",
-                        lines=15,
-                        show_copy_button=True,
-                        placeholder="Transcribed text will appear here..."
-                    )
-
-                with gr.Tab("🔍 Debug Logs"):
-                    log_output = gr.Textbox(
-                        label="Processing Logs",
-                        lines=20,
-                        interactive=False
-                    )
-
+                with gr.Tab("Extracted Text"):
+                    output_txt = gr.Textbox(label="Result", lines=15, show_copy_button=True)
+                with gr.Tab("Debug Logs"):
+                    # CHANGED HERE: Uses Textbox instead of Code to avoid version errors
+                    log_output = gr.Textbox(label="Processing Logs", lines=20, interactive=False)
+
     with gr.Row():
-        gallery = gr.Gallery(
-            label="📸 Line Crops (For Debugging)",
-            columns=4,
-            height=200,
-            object_fit="contain"
-        )
-
-    gr.Markdown("""
-    ---
-    ### 💡 Tips for Best Results:
-    - Upload clear, high-contrast images
-    - Ensure text is not too small (minimum 15px height)
-    - Try enabling/disabling preprocessing based on your image quality
-    - Check debug logs if results are unexpected
-    """)
-
-    # Connect button to processing function
-    btn.click(
-        fn=process_image,
-        inputs=[input_img, use_preprocess, eps_slider],
-        outputs=[output_img, gallery, output_txt, log_output]
-    )
+        gallery = gr.Gallery(label="Final Line Crops", columns=4, height=200)
+
+    btn.click(process_image, input_img, [output_img, gallery, output_txt, log_output])
 
 if __name__ == "__main__":
     demo.launch()
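
The restored merge_boxes_into_lines replaces the clustering with a greedy sweep: pop the topmost unassigned box, pull in every box whose y-center lies within half the reference box's height, and collapse the group into one rectangle. A minimal standalone sketch of that sweep on toy boxes (illustrative values, not app data):

    def merge_lines(rects):
        rects = sorted(rects, key=lambda r: (r[1] + r[3]) / 2)  # top to bottom
        lines = []
        while rects:
            ref = rects.pop(0)
            tol = (ref[3] - ref[1]) * 0.5  # half the reference box's height
            ref_yc = (ref[1] + ref[3]) / 2
            line = [ref] + [r for r in rects
                            if abs((r[1] + r[3]) / 2 - ref_yc) < tol]
            rects = [r for r in rects if r not in line]
            lines.append([min(r[0] for r in line), min(r[1] for r in line),
                          max(r[2] for r in line), max(r[3] for r in line)])
        return sorted(lines, key=lambda r: r[1])

    # Two words on one baseline plus one word below -> two line boxes:
    print(merge_lines([[0, 10, 40, 30], [50, 12, 90, 32], [0, 60, 80, 80]]))
    # [[0, 10, 90, 32], [0, 60, 80, 80]]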