# import gradio as gr
# from transformers import TrOCRProcessor, VisionEncoderDecoderModel
# import torch
# from PIL import Image
# # --- Model Setup ---
# # We load the model outside the inference function to cache it on startup
# MODEL_ID = "microsoft/trocr-base-handwritten"
# print(f"Loading {MODEL_ID}...")
# processor = TrOCRProcessor.from_pretrained(MODEL_ID)
# model = VisionEncoderDecoderModel.from_pretrained(MODEL_ID)
# # Check for GPU (Free Spaces are usually CPU-only, but this handles upgrades)
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model.to(device)
# print(f"Model loaded on device: {device}")
# # --- Inference Function ---
# def process_image(image):
#     if image is None:
#         return "Please upload an image."
#     try:
#         # 1. Convert to RGB (standardizes input)
#         image = image.convert("RGB")
#         # 2. Preprocess
#         pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)
#         # 3. Generate text
#         generated_ids = model.generate(pixel_values)
#         generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
#         return generated_text
#     except Exception as e:
#         return f"Error: {str(e)}"
# # --- Gradio Interface ---
# # Using the Blocks API for a clean layout
# with gr.Blocks(theme=gr.themes.Soft()) as demo:
#     gr.Markdown(
#         """
#         # ✍️ Handwritten Text Recognition
#         Using Microsoft's **TrOCR Base** model. Upload a handwritten note to transcribe it.
#         """
#     )
#     with gr.Row():
#         with gr.Column():
#             input_img = gr.Image(type="pil", label="Upload Image")
#             submit_btn = gr.Button("Transcribe", variant="primary")
#         with gr.Column():
#             output_text = gr.Textbox(label="Result", interactive=False)
#     # Examples help users test it immediately without uploading their own file
#     # (Uncomment the list below if you upload example images to your repo)
#     # gr.Examples(["sample1.jpg"], inputs=input_img)
#     submit_btn.click(fn=process_image, inputs=input_img, outputs=output_text)
# # Launch for Spaces
# if __name__ == "__main__":
#     demo.launch()
# import gradio as gr
# import torch
# import numpy as np
# import cv2
# from PIL import Image
# from transformers import TrOCRProcessor, VisionEncoderDecoderModel
# from craft_text_detector import Craft
# # ==========================================
# # 🔧 PATCH 1: Fix Torchvision Compatibility
# # ==========================================
# import torchvision.models.vgg
# if not hasattr(torchvision.models.vgg, 'model_urls'):
#     torchvision.models.vgg.model_urls = {
#         'vgg16_bn': 'https://download.pytorch.org/models/vgg16_bn-6c64b313.pth'
#     }
# # ==========================================
# # 🔧 PATCH 2: The "Ratio Net" Logic Fix
# # ==========================================
# import craft_text_detector.craft_utils as craft_utils_module
# def fixed_adjustResultCoordinates(polys, ratio_w, ratio_h, ratio_net=2):
#     if not polys:
#         return []
#     adjusted = []
#     for poly in polys:
#         if poly is None or len(poly) == 0:
#             continue
#         # Convert to numpy and reshape
#         p = np.array(poly).reshape(-1, 2)
#         # Scale correctly using ratio_net
#         p[:, 0] *= (ratio_w * ratio_net)
#         p[:, 1] *= (ratio_h * ratio_net)
#         adjusted.append(p)
#     return adjusted
# craft_utils_module.adjustResultCoordinates = fixed_adjustResultCoordinates
# # ==========================================
# # --- 1. SETUP MODEL (Switched to BASE for stability) ---
# device = "cuda" if torch.cuda.is_available() else "cpu"
# print(f"Loading TrOCR-Base on {device}...")
# # We use the 'base' model because 'small' hallucinates Wikipedia text on tight crops
# MODEL_ID = "microsoft/trocr-base-handwritten"
# processor = TrOCRProcessor.from_pretrained(MODEL_ID)
# model = VisionEncoderDecoderModel.from_pretrained(MODEL_ID).to(device).eval()
# print("Loading CRAFT...")
# craft = Craft(output_dir=None, crop_type="box", cuda=(device == "cuda"))
# # --- 2. HELPER FUNCTIONS ---
# def get_sorted_boxes(boxes):
#     """Sorts boxes top-to-bottom (lines), then left-to-right."""
#     if not boxes: return []
#     items = []
#     for box in boxes:
#         cy = np.mean(box[:, 1])
#         cx = np.mean(box[:, 0])
#         items.append((cy, cx, box))
#     # Sort by line (approx 20px tolerance) then by column
#     items.sort(key=lambda x: (int(x[0] // 20), x[1]))
#     return [x[2] for x in items]
# def process_image(image):
#     if image is None:
#         return None, [], "Please upload an image."
#     # Convert to standard RGB Numpy array
#     # We use the FULL resolution image (no resizing) to keep text sharp
#     image_np = np.array(image.convert("RGB"))
#     # 1. DETECT
#     # The patch ensures coordinates map perfectly to this full-res image
#     prediction = craft.detect_text(image_np)
#     boxes = prediction.get("boxes", [])
#     if not boxes:
#         return image, [], "No text detected."
#     sorted_boxes = get_sorted_boxes(boxes)
#     annotated_img = image_np.copy()
#     results = []
#     debug_crops = []
#     # 2. PROCESS BOXES
#     for box in sorted_boxes:
#         box_int = box.astype(np.int32)
#         # Draw the box (Visual verification)
#         cv2.polylines(annotated_img, [box_int], True, (255, 0, 0), 3)
#         # --- CROP WITH PADDING (Crucial Fix) ---
#         # TrOCR needs 'breathing room' or it hallucinates.
#         PADDING = 10
#         x_min = max(0, np.min(box_int[:, 0]) - PADDING)
#         x_max = min(image_np.shape[1], np.max(box_int[:, 0]) + PADDING)
#         y_min = max(0, np.min(box_int[:, 1]) - PADDING)
#         y_max = min(image_np.shape[0], np.max(box_int[:, 1]) + PADDING)
#         # Skip noise
#         if (x_max - x_min) < 20 or (y_max - y_min) < 10:
#             continue
#         crop = image_np[y_min:y_max, x_min:x_max]
#         # Convert to PIL for Model
#         pil_crop = Image.fromarray(crop)
#         # Add to debug gallery so user can see what the model sees
#         debug_crops.append(pil_crop)
#         # 3. RECOGNIZE
#         with torch.no_grad():
#             pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
#             generated_ids = model.generate(pixel_values)
#             text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
#         if text.strip():
#             results.append(text)
#     full_text = "\n".join(results)
#     return Image.fromarray(annotated_img), debug_crops, full_text
# # --- 3. GRADIO UI ---
# with gr.Blocks(theme=gr.themes.Soft()) as demo:
#     gr.Markdown("# 📝 Robust Handwritten OCR (Base Model)")
#     gr.Markdown("Includes padding and a stronger model to prevent hallucinations.")
#     with gr.Row():
#         with gr.Column(scale=1):
#             input_img = gr.Image(type="pil", label="Upload Image")
#             btn = gr.Button("Transcribe", variant="primary")
#         with gr.Column(scale=1):
#             output_img = gr.Image(label="Detections")
#             output_txt = gr.Textbox(label="Extracted Text", lines=15, show_copy_button=True)
#     with gr.Row():
#         # Gallery to check if crops are valid or empty
#         crop_gallery = gr.Gallery(label="Debug: See what the model sees (Crops)", columns=6, height=200)
#     btn.click(process_image, input_img, [output_img, crop_gallery, output_txt])
# if __name__ == "__main__":
#     demo.launch()
# import gradio as gr
# import torch
# import numpy as np
# import cv2
# from PIL import Image
# from transformers import TrOCRProcessor, VisionEncoderDecoderModel
# from paddleocr import PaddleOCR
# # --- 1. SETUP TR-OCR (Recognition) ---
# device = "cuda" if torch.cuda.is_available() else "cpu"
# print(f"Loading TrOCR on {device}...")
# processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
# model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten').to(device).eval()
# # --- 2. SETUP PADDLEOCR (Detection Only) ---
# print("Loading PaddleOCR (DBNet)...")
# # We load the detector but we will bypass the main .ocr() method to avoid bugs
# detector = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
# def get_sorted_boxes(boxes):
#     """Sorts boxes top-to-bottom (lines), then left-to-right."""
#     if boxes is None or len(boxes) == 0:
#         return []
#     items = []
#     for box in boxes:
#         # Paddle returns boxes as numpy arrays or lists
#         box = np.array(box).astype(np.float32)
#         cy = np.mean(box[:, 1])
#         cx = np.mean(box[:, 0])
#         items.append((cy, cx, box))
#     # Sort by Y (line tolerance 20px) then X
#     items.sort(key=lambda x: (int(x[0] // 20), x[1]))
#     return [x[2] for x in items]
# def process_image(image):
#     if image is None:
#         return None, [], "Please upload an image."
#     # Convert to standard RGB Numpy array
#     image_np = np.array(image.convert("RGB"))
#     # ============================================================
#     # 🔴 FIX: Direct Detection Bypass
#     # ============================================================
#     # The standard 'detector.ocr()' method has a bug in the current
#     # version that crashes when checking "if not boxes".
#     # We call the internal 'text_detector' directly to skip that check.
#     try:
#         dt_boxes, _ = detector.text_detector(image_np)
#     except Exception as e:
#         return image, [], f"Detection Error: {str(e)}"
#     if dt_boxes is None or len(dt_boxes) == 0:
#         return image, [], "No text detected."
#     # dt_boxes is already a numpy array of coordinates
#     sorted_boxes = get_sorted_boxes(dt_boxes)
#     annotated_img = image_np.copy()
#     results = []
#     debug_crops = []
#     # Process Boxes
#     for box in sorted_boxes:
#         box_int = box.astype(np.int32)
#         # Draw Box (Red, thickness 2)
#         cv2.polylines(annotated_img, [box_int], True, (255, 0, 0), 2)
#         # Crop with Padding (Prevents TrOCR Hallucinations)
#         PADDING = 10
#         x_min = max(0, np.min(box_int[:, 0]) - PADDING)
#         x_max = min(image_np.shape[1], np.max(box_int[:, 0]) + PADDING)
#         y_min = max(0, np.min(box_int[:, 1]) - PADDING)
#         y_max = min(image_np.shape[0], np.max(box_int[:, 1]) + PADDING)
#         # Skip noise
#         if (x_max - x_min) < 15 or (y_max - y_min) < 10:
#             continue
#         crop = image_np[y_min:y_max, x_min:x_max]
#         pil_crop = Image.fromarray(crop)
#         debug_crops.append(pil_crop)
#         # Recognition (TrOCR)
#         with torch.no_grad():
#             pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
#             generated_ids = model.generate(pixel_values)
#             text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
#         if text.strip():
#             results.append(text)
#     full_text = "\n".join(results)
#     return Image.fromarray(annotated_img), debug_crops, full_text
# # --- UI ---
# with gr.Blocks(theme=gr.themes.Soft()) as demo:
#     gr.Markdown("# ⚡ PaddleOCR + TrOCR (Robust)")
#     gr.Markdown("Using direct DBNet inference to avoid library bugs.")
#     with gr.Row():
#         with gr.Column(scale=1):
#             input_img = gr.Image(type="pil", label="Upload Image")
#             btn = gr.Button("Transcribe", variant="primary")
#         with gr.Column(scale=1):
#             output_img = gr.Image(label="Detections (Paddle)")
#             output_txt = gr.Textbox(label="Extracted Text", lines=15, show_copy_button=True)
#     with gr.Row():
#         gallery = gr.Gallery(label="Line Crops (Debug)", columns=6, height=200)
#     btn.click(process_image, input_img, [output_img, gallery, output_txt])
# if __name__ == "__main__":
#     demo.launch()
# import gradio as gr
# import torch
# import numpy as np
# import cv2
# from PIL import Image
# from transformers import TrOCRProcessor, VisionEncoderDecoderModel
# from paddleocr import PaddleOCR
# # --- 1. SETUP TR-OCR ---
# device = "cuda" if torch.cuda.is_available() else "cpu"
# print(f"Loading TrOCR on {device}...")
# processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
# model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten').to(device).eval()
# # --- 2. SETUP PADDLEOCR ---
# print("Loading PaddleOCR...")
# # High resolution to catch faint text
# detector = PaddleOCR(use_angle_cls=True, lang='en', show_log=False,
#                      det_limit_side_len=2500, det_db_thresh=0.1, det_db_box_thresh=0.3)
# # ==========================================
# # 🧠 LOGIC FIX 1: REMOVE NESTED BOXES
# # ==========================================
# def calculate_overlap_area(box1, box2):
#     """Calculates the intersection area between two boxes."""
#     x1 = max(box1[0], box2[0])
#     y1 = max(box1[1], box2[1])
#     x2 = min(box1[2], box2[2])
#     y2 = min(box1[3], box2[3])
#     if x2 < x1 or y2 < y1:
#         return 0.0
#     return (x2 - x1) * (y2 - y1)
# def filter_nested_boxes(boxes, containment_thresh=0.80):
#     """
#     Removes boxes that are mostly contained within other larger boxes.
#     """
#     if not boxes: return []
#     # Convert all to [x1, y1, x2, y2, area]
#     active = []
#     for b in boxes:
#         area = (b[2] - b[0]) * (b[3] - b[1])
#         active.append(list(b) + [area])
#     # Sort by area (Largest to Smallest) - Crucial!
#     # We want to keep the big 'parent' box and delete the small 'child' box.
#     active.sort(key=lambda x: x[4], reverse=True)
#     final_boxes = []
#     for i, current in enumerate(active):
#         is_nested = False
#         curr_area = current[4]
#         # Check against all boxes we've already accepted (which are bigger/same size)
#         for kept in final_boxes:
#             overlap = calculate_overlap_area(current, kept)
#             # Check if 'current' is inside 'kept'
#             # If >80% of current box is covered by kept box, it's a duplicate/nested box
#             if (overlap / curr_area) > containment_thresh:
#                 is_nested = True
#                 break
#         if not is_nested:
#             final_boxes.append(current[:4])  # Store only coords, drop area
#     return final_boxes
# # ==========================================
# # 🧠 LOGIC FIX 2: MERGE WORDS INTO LINES
# # ==========================================
# def merge_boxes_into_lines(raw_boxes, y_thresh=30):
#     if raw_boxes is None or len(raw_boxes) == 0:
#         return []
#     # 1. Convert raw polygons to Axis-Aligned Rectangles
#     rects = []
#     for box in raw_boxes:
#         box = np.array(box).astype(np.float32)
#         x1 = np.min(box[:, 0])
#         y1 = np.min(box[:, 1])
#         x2 = np.max(box[:, 0])
#         y2 = np.max(box[:, 1])
#         rects.append([x1, y1, x2, y2])
#     # 🔴 STEP 2: Filter Nested Boxes (Remove the 'child' boxes)
#     rects = filter_nested_boxes(rects)
#     # 3. Sort by Y center
#     rects.sort(key=lambda r: (r[1] + r[3]) / 2)
#     merged_lines = []
#     while rects:
#         current_line = [rects.pop(0)]
#         line_y_center = (current_line[0][1] + current_line[0][3]) / 2
#         remaining = []
#         for r in rects:
#             r_y_center = (r[1] + r[3]) / 2
#             # If Y-center is close (same horizontal line)
#             if abs(r_y_center - line_y_center) < y_thresh:
#                 current_line.append(r)
#             else:
#                 remaining.append(r)
#         rects = remaining
#         # 4. Create Line Box
#         lx1 = min(r[0] for r in current_line)
#         ly1 = min(r[1] for r in current_line)
#         lx2 = max(r[2] for r in current_line)
#         ly2 = max(r[3] for r in current_line)
#         merged_lines.append([lx1, ly1, lx2, ly2])
#     # Final Sort by Y
#     merged_lines.sort(key=lambda r: r[1])
#     return merged_lines
# def process_image(image):
#     if image is None: return None, [], "Please upload an image."
#     image_np = np.array(image.convert("RGB"))
#     # DETECT
#     try:
#         dt_boxes, _ = detector.text_detector(image_np)
#     except Exception as e:
#         return image, [], f"Detection Error: {str(e)}"
#     if dt_boxes is None or len(dt_boxes) == 0:
#         return image, [], "No text detected."
#     # PROCESS (Filter Nested -> Merge Lines)
#     line_boxes = merge_boxes_into_lines(dt_boxes)
#     annotated_img = image_np.copy()
#     results = []
#     debug_crops = []
#     for box in line_boxes:
#         x1, y1, x2, y2 = map(int, box)
#         # Filter Noise
#         if (x2 - x1) < 20 or (y2 - y1) < 15:
#             continue
#         # Draw (Green)
#         cv2.rectangle(annotated_img, (x1, y1), (x2, y2), (0, 255, 0), 2)
#         # PADDING
#         PAD = 10
#         h, w, _ = image_np.shape
#         x1 = max(0, x1 - PAD)
#         y1 = max(0, y1 - PAD)
#         x2 = min(w, x2 + PAD)
#         y2 = min(h, y2 + PAD)
#         crop = image_np[y1:y2, x1:x2]
#         pil_crop = Image.fromarray(crop)
#         debug_crops.append(pil_crop)
#         # RECOGNIZE
#         with torch.no_grad():
#             pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
#             generated_ids = model.generate(pixel_values)
#             text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
#         if text.strip():
#             results.append(text)
#     full_text = "\n".join(results)
#     return Image.fromarray(annotated_img), debug_crops, full_text
# # --- UI ---
# with gr.Blocks(theme=gr.themes.Soft()) as demo:
#     gr.Markdown("# ⚡ Smart Line-Level OCR (Cleaned)")
#     with gr.Row():
#         with gr.Column(scale=1):
#             input_img = gr.Image(type="pil", label="Upload Image")
#             btn = gr.Button("Transcribe", variant="primary")
#         with gr.Column(scale=1):
#             output_img = gr.Image(label="Cleaned Lines (Green Boxes)")
#             output_txt = gr.Textbox(label="Extracted Text", lines=15, show_copy_button=True)
#     with gr.Row():
#         gallery = gr.Gallery(label="Final Line Crops", columns=4, height=200)
#     btn.click(process_image, input_img, [output_img, gallery, output_txt])
# if __name__ == "__main__":
#     demo.launch()
# import gradio as gr
# import torch
# import numpy as np
# import cv2
# from PIL import Image
# from transformers import TrOCRProcessor, VisionEncoderDecoderModel
# from paddleocr import PaddleOCR
# # Setup
# device = "cuda" if torch.cuda.is_available() else "cpu"
# print(f"Loading TrOCR on {device}...")
# processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
# model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten').to(device).eval()
# print("Loading PaddleOCR...")
# detector = PaddleOCR(use_angle_cls=True, lang='en', show_log=False,
#                      det_limit_side_len=2500, det_db_thresh=0.1, det_db_box_thresh=0.3)
# def calculate_iou(box1, box2):
#     """Calculate Intersection over Union"""
#     x1 = max(box1[0], box2[0])
#     y1 = max(box1[1], box2[1])
#     x2 = min(box1[2], box2[2])
#     y2 = min(box1[3], box2[3])
#     if x2 < x1 or y2 < y1:
#         return 0.0
#     intersection = (x2 - x1) * (y2 - y1)
#     area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
#     area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
#     return intersection / min(area1, area2)
# def remove_nested_boxes(boxes, iou_thresh=0.7):
#     """Remove boxes that are nested inside others"""
#     if len(boxes) == 0:
#         return []
#     # Add area to each box
#     boxes_with_area = []
#     for b in boxes:
#         area = (b[2] - b[0]) * (b[3] - b[1])
#         boxes_with_area.append((*b, area))
#     # Sort by area descending (keep larger boxes)
#     boxes_with_area.sort(key=lambda x: x[4], reverse=True)
#     keep = []
#     for i, current in enumerate(boxes_with_area):
#         should_keep = True
#         curr_box = current[:4]
#         for kept in keep:
#             iou = calculate_iou(curr_box, kept)
#             if iou > iou_thresh:
#                 should_keep = False
#                 break
#         if should_keep:
#             keep.append(curr_box)
#     return keep
# def merge_boxes_into_lines(raw_boxes, y_overlap_thresh=0.5, x_gap_thresh=100):
#     """Merge boxes into lines with better horizontal merging"""
#     if raw_boxes is None or len(raw_boxes) == 0:
#         return []
#     # Convert polygons to rectangles
#     rects = []
#     for box in raw_boxes:
#         box = np.array(box).astype(np.float32)
#         x1, y1 = np.min(box[:, 0]), np.min(box[:, 1])
#         x2, y2 = np.max(box[:, 0]), np.max(box[:, 1])
#         rects.append([x1, y1, x2, y2])
#     # Remove nested boxes
#     rects = remove_nested_boxes(rects)
#     if len(rects) == 0:
#         return []
#     # Sort by Y position
#     rects.sort(key=lambda r: r[1])
#     # Group into lines based on Y overlap
#     lines = []
#     current_line = [rects[0]]
#     for rect in rects[1:]:
#         # Check if rect belongs to current line
#         line_y1 = min(r[1] for r in current_line)
#         line_y2 = max(r[3] for r in current_line)
#         line_height = line_y2 - line_y1
#         rect_y1, rect_y2 = rect[1], rect[3]
#         rect_height = rect_y2 - rect_y1
#         # Calculate vertical overlap
#         overlap_y1 = max(line_y1, rect_y1)
#         overlap_y2 = min(line_y2, rect_y2)
#         overlap = max(0, overlap_y2 - overlap_y1)
#         # If significant vertical overlap, it's the same line
#         if overlap > y_overlap_thresh * min(line_height, rect_height):
#             current_line.append(rect)
#         else:
#             # Save current line and start new one
#             lines.append(current_line)
#             current_line = [rect]
#     lines.append(current_line)
#     # Merge boxes in each line
#     merged = []
#     for line in lines:
#         # Sort line boxes left to right
#         line.sort(key=lambda r: r[0])
#         # Merge horizontally close boxes
#         merged_line = [line[0]]
#         for rect in line[1:]:
#             last = merged_line[-1]
#             # If close horizontally, merge
#             if rect[0] - last[2] < x_gap_thresh:
#                 merged_line[-1] = [
#                     min(last[0], rect[0]),
#                     min(last[1], rect[1]),
#                     max(last[2], rect[2]),
#                     max(last[3], rect[3])
#                 ]
#             else:
#                 merged_line.append(rect)
#         # Final merge: combine all boxes in line into one
#         x1 = min(r[0] for r in merged_line)
#         y1 = min(r[1] for r in merged_line)
#         x2 = max(r[2] for r in merged_line)
#         y2 = max(r[3] for r in merged_line)
#         merged.append([x1, y1, x2, y2])
#     # Sort by Y
#     merged.sort(key=lambda r: r[1])
#     return merged
# def process_image(image):
#     if image is None:
#         return None, [], "Please upload an image."
#     image_np = np.array(image.convert("RGB"))
#     try:
#         dt_boxes, _ = detector.text_detector(image_np)
#     except Exception as e:
#         return image, [], f"Detection Error: {str(e)}"
#     if dt_boxes is None or len(dt_boxes) == 0:
#         return image, [], "No text detected."
#     line_boxes = merge_boxes_into_lines(dt_boxes)
#     annotated_img = image_np.copy()
#     results = []
#     debug_crops = []
#     for box in line_boxes:
#         x1, y1, x2, y2 = map(int, box)
#         if (x2 - x1) < 20 or (y2 - y1) < 15:
#             continue
#         cv2.rectangle(annotated_img, (x1, y1), (x2, y2), (0, 255, 0), 2)
#         PAD = 10
#         h, w, _ = image_np.shape
#         x1 = max(0, x1 - PAD)
#         y1 = max(0, y1 - PAD)
#         x2 = min(w, x2 + PAD)
#         y2 = min(h, y2 + PAD)
#         crop = image_np[y1:y2, x1:x2]
#         pil_crop = Image.fromarray(crop)
#         debug_crops.append(pil_crop)
#         with torch.no_grad():
#             pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
#             generated_ids = model.generate(pixel_values)
#             text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
#         if text.strip():
#             results.append(text)
#     full_text = "\n".join(results)
#     return Image.fromarray(annotated_img), debug_crops, full_text
# with gr.Blocks(theme=gr.themes.Soft()) as demo:
#     gr.Markdown("# ⚡ Smart Line-Level OCR (Fixed)")
#     with gr.Row():
#         with gr.Column(scale=1):
#             input_img = gr.Image(type="pil", label="Upload Image")
#             btn = gr.Button("Transcribe", variant="primary")
#         with gr.Column(scale=1):
#             output_img = gr.Image(label="Detected Lines")
#             output_txt = gr.Textbox(label="Extracted Text", lines=15, show_copy_button=True)
#     with gr.Row():
#         gallery = gr.Gallery(label="Line Crops", columns=4, height=200)
#     btn.click(process_image, input_img, [output_img, gallery, output_txt])
# if __name__ == "__main__":
#     demo.launch()
# https://github.com/czczup/FAST
import gradio as gr
import torch
import numpy as np
import cv2
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from paddleocr import PaddleOCR
import pandas as pd
# --- 1. SETUP TR-OCR ---
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Loading TrOCR on {device}...")
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten').to(device).eval()
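# Note: model.generate() below runs with transformers' default decoding settings.
# If long lines come back truncated or decoding is slow, num_beams and
# max_new_tokens (standard generate() arguments) are the usual knobs, e.g.:
#   generated_ids = model.generate(pixel_values, num_beams=4, max_new_tokens=64)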
# --- 2. SETUP PADDLEOCR ---
print("Loading PaddleOCR...")
# High resolution settings to detect faint text
detector = PaddleOCR(use_angle_cls=True, lang='en', show_log=False,
                     det_limit_side_len=2500, det_db_thresh=0.1, det_db_box_thresh=0.3)
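# The three det_* overrides above trade precision for recall (values chosen here,
# not PaddleOCR defaults): det_limit_side_len=2500 preserves more resolution before
# detection, while the lowered det_db_thresh / det_db_box_thresh make DBNet keep
# fainter strokes and lower-confidence boxes.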
# ==========================================
# 🧠 LOGIC: INTERSECTION OVER UNION (IOU)
# ==========================================
def calculate_iou_containment(box1, box2):
    """
    Calculates how much of box1 is inside box2.
    Returns: ratio (0.0 to 1.0)
    """
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[2], box2[2])
    y2 = min(box1[3], box2[3])
    if x2 < x1 or y2 < y1:
        return 0.0
    intersection = (x2 - x1) * (y2 - y1)
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    return intersection / area1
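# Quick sanity check of the containment math (illustrative boxes, not app data):
# box [0, 0, 10, 10] has area 100 and overlaps [5, 0, 20, 10] over a 5x10 strip,
# so the containment ratio should come out to 50 / 100 = 0.5.
assert abs(calculate_iou_containment([0, 0, 10, 10], [5, 0, 20, 10]) - 0.5) < 1e-6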
def filter_nested_boxes(boxes, containment_thresh=0.85):
    """
    Removes boxes that are mostly contained within other larger boxes.
    """
    if not boxes:
        return []
    # [x1, y1, x2, y2, area]
    active = []
    for b in boxes:
        area = (b[2] - b[0]) * (b[3] - b[1])
        active.append(list(b) + [area])
    # Sort by Area descending (Biggest first)
    active.sort(key=lambda x: x[4], reverse=True)
    final_boxes = []
    for current in active:
        is_nested = False
        curr_box = current[:4]
        # Check if this box is inside any bigger box we already kept
        for kept in final_boxes:
            overlap_ratio = calculate_iou_containment(curr_box, kept)
            if overlap_ratio > containment_thresh:
                is_nested = True
                break
        if not is_nested:
            final_boxes.append(curr_box)
    return final_boxes
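# Same idea end to end (illustrative data): a word box fully inside a line box is
# 100% contained, which is above the 0.85 threshold, so only the parent survives.
assert filter_nested_boxes([[0, 0, 100, 100], [10, 10, 20, 20]]) == [[0, 0, 100, 100]]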
# ==========================================
# 🧠 LOGIC: STRICT LINE MERGING
# ==========================================
def merge_boxes_into_lines(raw_boxes, log_data):
    """
    Merges boxes horizontally but prevents vertical merging.
    """
    if raw_boxes is None or len(raw_boxes) == 0:
        return []
    # 1. Convert to Rects
    rects = []
    for box in raw_boxes:
        box = np.array(box).astype(np.float32)
        x1, y1 = np.min(box[:, 0]), np.min(box[:, 1])
        x2, y2 = np.max(box[:, 0]), np.max(box[:, 1])
        rects.append([x1, y1, x2, y2])
    log_data.append(f"Raw Detections: {len(rects)} boxes found.")
    # 2. Filter Nested
    rects = filter_nested_boxes(rects)
    log_data.append(f"After Cleaning Nested: {len(rects)} boxes remain.")
    # 3. Sort by Y-Center (Top to Bottom)
    rects.sort(key=lambda r: (r[1] + r[3]) / 2)
    lines = []
    while rects:
        # Start a new line with the highest remaining box
        current_line = [rects.pop(0)]
        # Calculate the dynamic "height" of this line based on the first word
        ref_h = current_line[0][3] - current_line[0][1]
        ref_y_center = (current_line[0][1] + current_line[0][3]) / 2
        # Look for other words on this SAME line
        # STRICT RULE: A box is on the same line ONLY if its Y-center
        # is within 50% of the reference box's height.
        vertical_tolerance = ref_h * 0.5
        remaining_rects = []
        for r in rects:
            r_y_center = (r[1] + r[3]) / 2
            if abs(r_y_center - ref_y_center) < vertical_tolerance:
                current_line.append(r)
            else:
                remaining_rects.append(r)
        rects = remaining_rects
        # Sort words in this line left-to-right
        current_line.sort(key=lambda r: r[0])
        # 4. Merge the horizontal group into ONE box
        lx1 = min(r[0] for r in current_line)
        ly1 = min(r[1] for r in current_line)
        lx2 = max(r[2] for r in current_line)
        ly2 = max(r[3] for r in current_line)
        lines.append([lx1, ly1, lx2, ly2])
    # Final Sort by Y
    lines.sort(key=lambda r: r[1])
    log_data.append(f"Final Merged Lines: {len(lines)} lines created.")
    return lines
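# Illustrative check of the merge rule: two word boxes whose Y-centers sit within
# half the reference box's height collapse into one line; a lower box starts a new line.
_demo_logs = []
_demo_polys = [
    [[0, 0], [50, 0], [50, 20], [0, 20]],      # word 1, line 1
    [[60, 2], [120, 2], [120, 22], [60, 22]],  # word 2, same line
    [[0, 50], [80, 50], [80, 70], [0, 70]],    # word 3, next line
]
assert len(merge_boxes_into_lines(_demo_polys, _demo_logs)) == 2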
def process_image(image):
    logs = []  # Store debug messages here
    if image is None:
        return None, [], "Please upload an image.", "No logs."
    image_np = np.array(image.convert("RGB"))
    # DETECT
    try:
        dt_boxes, _ = detector.text_detector(image_np)
    except Exception as e:
        return image, [], f"Detection Error: {str(e)}", "\n".join(logs)
    if dt_boxes is None or len(dt_boxes) == 0:
        return image, [], "No text detected.", "\n".join(logs)
    # PROCESS
    line_boxes = merge_boxes_into_lines(dt_boxes, logs)
    annotated_img = image_np.copy()
    results = []
    debug_crops = []
    # Log the final box coordinates for inspection
    logs.append("\n--- Final Box Coordinates ---")
    for i, box in enumerate(line_boxes):
        x1, y1, x2, y2 = map(int, box)
        logs.append(f"Line {i+1}: x={x1}, y={y1}, w={x2-x1}, h={y2-y1}")
        # Filter Noise
        if (x2 - x1) < 20 or (y2 - y1) < 15:
            logs.append(f"-> Skipped Line {i+1} (Too Small/Noise)")
            continue
        # Draw (Green)
        cv2.rectangle(annotated_img, (x1, y1), (x2, y2), (0, 255, 0), 2)
        # PADDING
        PAD = 10
        h, w, _ = image_np.shape
        x1 = max(0, x1 - PAD)
        y1 = max(0, y1 - PAD)
        x2 = min(w, x2 + PAD)
        y2 = min(h, y2 + PAD)
        crop = image_np[y1:y2, x1:x2]
        pil_crop = Image.fromarray(crop)
        debug_crops.append(pil_crop)
        # RECOGNIZE
        with torch.no_grad():
            pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
            generated_ids = model.generate(pixel_values)
            text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        if text.strip():
            results.append(text)
    full_text = "\n".join(results)
    return Image.fromarray(annotated_img), debug_crops, full_text, "\n".join(logs)
# --- UI ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# ⚡ Smart Line-Level OCR (Debug Mode)")
    with gr.Row():
        with gr.Column(scale=1):
            input_img = gr.Image(type="pil", label="Upload Image")
            btn = gr.Button("Transcribe", variant="primary")
        with gr.Column(scale=1):
            with gr.Tabs():
                with gr.Tab("Visualization"):
                    output_img = gr.Image(label="Detected Lines")
                with gr.Tab("Extracted Text"):
                    output_txt = gr.Textbox(label="Result", lines=15, show_copy_button=True)
                with gr.Tab("Debug Logs"):
                    # Uses Textbox instead of Code to avoid version errors
                    log_output = gr.Textbox(label="Processing Logs", lines=20, interactive=False)
    with gr.Row():
        gallery = gr.Gallery(label="Final Line Crops", columns=4, height=200)
    btn.click(process_image, input_img, [output_img, gallery, output_txt, log_output])
if __name__ == "__main__":
    demo.launch()
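# Assumed Space dependencies (a sketch of requirements.txt; this file does not pin
# versions, so treat these as a guess): gradio, torch, torchvision, transformers,
# opencv-python-headless, pillow, paddlepaddle, paddleocr, pandas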