# ocr / app.py
# iammraat's picture
# Update app.py
# 55d6595 verified
# import gradio as gr
# from transformers import TrOCRProcessor, VisionEncoderDecoderModel
# import torch
# from PIL import Image
# # --- Model Setup ---
# # We load the model outside the inference function to cache it on startup
# MODEL_ID = "microsoft/trocr-base-handwritten"
# print(f"Loading {MODEL_ID}...")
# processor = TrOCRProcessor.from_pretrained(MODEL_ID)
# model = VisionEncoderDecoderModel.from_pretrained(MODEL_ID)
# # Check for GPU (Free Spaces are usually CPU-only, but this handles upgrades)
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model.to(device)
# print(f"Model loaded on device: {device}")
# # --- Inference Function ---
# def process_image(image):
# if image is None:
# return "Please upload an image."
# try:
# # 1. Convert to RGB (standardizes input)
# image = image.convert("RGB")
# # 2. Preprocess
# pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)
# # 3. Generate text
# generated_ids = model.generate(pixel_values)
# generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
# return generated_text
# except Exception as e:
# return f"Error: {str(e)}"
# # --- Gradio Interface ---
# # Using the Blocks API for a clean layout
# with gr.Blocks(theme=gr.themes.Soft()) as demo:
# gr.Markdown(
# """
# # ✍️ Handwritten Text Recognition
# Using Microsoft's **TrOCR Small** model. Upload a handwritten note to transcribe it.
# """
# )
# with gr.Row():
# with gr.Column():
# input_img = gr.Image(type="pil", label="Upload Image")
# submit_btn = gr.Button("Transcribe", variant="primary")
# with gr.Column():
# output_text = gr.Textbox(label="Result", interactive=False)
# # Examples help users test it immediately without uploading their own file
# # (Uncomment the list below if you upload example images to your repo)
# # gr.Examples(["sample1.jpg"], inputs=input_img)
# submit_btn.click(fn=process_image, inputs=input_img, outputs=output_text)
# # Launch for Spaces
# if __name__ == "__main__":
# demo.launch()
# import gradio as gr
# import torch
# import numpy as np
# import cv2
# from PIL import Image
# from transformers import TrOCRProcessor, VisionEncoderDecoderModel
# from craft_text_detector import Craft
# # ==========================================
# # 🔧 PATCH 1: Fix Torchvision Compatibility
# # ==========================================
# import torchvision.models.vgg
# if not hasattr(torchvision.models.vgg, 'model_urls'):
# torchvision.models.vgg.model_urls = {
# 'vgg16_bn': 'https://download.pytorch.org/models/vgg16_bn-6c64b313.pth'
# }
# # ==========================================
# # 🔧 PATCH 2: The "Ratio Net" Logic Fix
# # ==========================================
# import craft_text_detector.craft_utils as craft_utils_module
# def fixed_adjustResultCoordinates(polys, ratio_w, ratio_h, ratio_net=2):
# if not polys:
# return []
# adjusted = []
# for poly in polys:
# if poly is None or len(poly) == 0:
# continue
# # Convert to numpy and reshape
# p = np.array(poly).reshape(-1, 2)
# # Scale correctly using ratio_net
# p[:, 0] *= (ratio_w * ratio_net)
# p[:, 1] *= (ratio_h * ratio_net)
# adjusted.append(p)
# return adjusted
# craft_utils_module.adjustResultCoordinates = fixed_adjustResultCoordinates
# # ==========================================
# # --- 1. SETUP MODEL (Switched to BASE for stability) ---
# device = "cuda" if torch.cuda.is_available() else "cpu"
# print(f"Loading TrOCR-Base on {device}...")
# # We use the 'base' model because 'small' hallucinates Wikipedia text on tight crops
# MODEL_ID = "microsoft/trocr-base-handwritten"
# processor = TrOCRProcessor.from_pretrained(MODEL_ID)
# model = VisionEncoderDecoderModel.from_pretrained(MODEL_ID).to(device).eval()
# print("Loading CRAFT...")
# craft = Craft(output_dir=None, crop_type="box", cuda=(device == "cuda"))
# # --- 2. HELPER FUNCTIONS ---
# def get_sorted_boxes(boxes):
# """Sorts boxes top-to-bottom (lines), then left-to-right."""
# if not boxes: return []
# items = []
# for box in boxes:
# cy = np.mean(box[:, 1])
# cx = np.mean(box[:, 0])
# items.append((cy, cx, box))
# # Sort by line (approx 20px tolerance) then by column
# items.sort(key=lambda x: (int(x[0] // 20), x[1]))
# return [x[2] for x in items]
# def process_image(image):
# if image is None:
# return None, [], "Please upload an image."
# # Convert to standard RGB Numpy array
# # We use the FULL resolution image (no resizing) to keep text sharp
# image_np = np.array(image.convert("RGB"))
# # 1. DETECT
# # The patch ensures coordinates map perfectly to this full-res image
# prediction = craft.detect_text(image_np)
# boxes = prediction.get("boxes", [])
# if not boxes:
# return image, [], "No text detected."
# sorted_boxes = get_sorted_boxes(boxes)
# annotated_img = image_np.copy()
# results = []
# debug_crops = []
# # 2. PROCESS BOXES
# for box in sorted_boxes:
# box_int = box.astype(np.int32)
# # Draw the box (Visual verification)
# cv2.polylines(annotated_img, [box_int], True, (255, 0, 0), 3)
# # --- CROP WITH PADDING (Crucial Fix) ---
# # TrOCR needs 'breathing room' or it hallucinates.
# PADDING = 10
# x_min = max(0, np.min(box_int[:, 0]) - PADDING)
# x_max = min(image_np.shape[1], np.max(box_int[:, 0]) + PADDING)
# y_min = max(0, np.min(box_int[:, 1]) - PADDING)
# y_max = min(image_np.shape[0], np.max(box_int[:, 1]) + PADDING)
# # Skip noise
# if (x_max - x_min) < 20 or (y_max - y_min) < 10:
# continue
# crop = image_np[y_min:y_max, x_min:x_max]
# # Convert to PIL for Model
# pil_crop = Image.fromarray(crop)
# # Add to debug gallery so user can see what the model sees
# debug_crops.append(pil_crop)
# # 3. RECOGNIZE
# with torch.no_grad():
# pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
# generated_ids = model.generate(pixel_values)
# text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
# if text.strip():
# results.append(text)
# full_text = "\n".join(results)
# return Image.fromarray(annotated_img), debug_crops, full_text
# # --- 3. GRADIO UI ---
# with gr.Blocks(theme=gr.themes.Soft()) as demo:
# gr.Markdown("# 📝 Robust Handwritten OCR (Base Model)")
# gr.Markdown("Includes padding and a stronger model to prevent hallucinations.")
# with gr.Row():
# with gr.Column(scale=1):
# input_img = gr.Image(type="pil", label="Upload Image")
# btn = gr.Button("Transcribe", variant="primary")
# with gr.Column(scale=1):
# output_img = gr.Image(label="Detections")
# output_txt = gr.Textbox(label="Extracted Text", lines=15, show_copy_button=True)
# with gr.Row():
# # Gallery to check if crops are valid or empty
# crop_gallery = gr.Gallery(label="Debug: See what the model sees (Crops)", columns=6, height=200)
# btn.click(process_image, input_img, [output_img, crop_gallery, output_txt])
# if __name__ == "__main__":
# demo.launch()
# import gradio as gr
# import torch
# import numpy as np
# import cv2
# from PIL import Image
# from transformers import TrOCRProcessor, VisionEncoderDecoderModel
# from paddleocr import PaddleOCR
# # --- 1. SETUP TR-OCR (Recognition) ---
# device = "cuda" if torch.cuda.is_available() else "cpu"
# print(f"Loading TrOCR on {device}...")
# processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
# model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten').to(device).eval()
# # --- 2. SETUP PADDLEOCR (Detection Only) ---
# print("Loading PaddleOCR (DBNet)...")
# # We load the detector but we will bypass the main .ocr() method to avoid bugs
# detector = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
# def get_sorted_boxes(boxes):
# """Sorts boxes top-to-bottom (lines), then left-to-right."""
# if boxes is None or len(boxes) == 0:
# return []
# items = []
# for box in boxes:
# # Paddle returns boxes as numpy arrays or lists
# box = np.array(box).astype(np.float32)
# cy = np.mean(box[:, 1])
# cx = np.mean(box[:, 0])
# items.append((cy, cx, box))
# # Sort by Y (line tolerance 20px) then X
# items.sort(key=lambda x: (int(x[0] // 20), x[1]))
# return [x[2] for x in items]
# def process_image(image):
# if image is None:
# return None, [], "Please upload an image."
# # Convert to standard RGB Numpy array
# image_np = np.array(image.convert("RGB"))
# # ============================================================
# # 🔴 FIX: Direct Detection Bypass
# # ============================================================
# # The standard 'detector.ocr()' method has a bug in the current
# # version that crashes when checking "if not boxes".
# # We call the internal 'text_detector' directly to skip that check.
# try:
# dt_boxes, _ = detector.text_detector(image_np)
# except Exception as e:
# return image, [], f"Detection Error: {str(e)}"
# if dt_boxes is None or len(dt_boxes) == 0:
# return image, [], "No text detected."
# # dt_boxes is already a numpy array of coordinates
# sorted_boxes = get_sorted_boxes(dt_boxes)
# annotated_img = image_np.copy()
# results = []
# debug_crops = []
# # Process Boxes
# for box in sorted_boxes:
# box_int = box.astype(np.int32)
# # Draw Box (Red, thickness 2)
# cv2.polylines(annotated_img, [box_int], True, (255, 0, 0), 2)
# # Crop with Padding (Prevents TrOCR Hallucinations)
# PADDING = 10
# x_min = max(0, np.min(box_int[:, 0]) - PADDING)
# x_max = min(image_np.shape[1], np.max(box_int[:, 0]) + PADDING)
# y_min = max(0, np.min(box_int[:, 1]) - PADDING)
# y_max = min(image_np.shape[0], np.max(box_int[:, 1]) + PADDING)
# # Skip noise
# if (x_max - x_min) < 15 or (y_max - y_min) < 10:
# continue
# crop = image_np[y_min:y_max, x_min:x_max]
# pil_crop = Image.fromarray(crop)
# debug_crops.append(pil_crop)
# # Recognition (TrOCR)
# with torch.no_grad():
# pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
# generated_ids = model.generate(pixel_values)
# text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
# if text.strip():
# results.append(text)
# full_text = "\n".join(results)
# return Image.fromarray(annotated_img), debug_crops, full_text
# # --- UI ---
# with gr.Blocks(theme=gr.themes.Soft()) as demo:
# gr.Markdown("# ⚡ PaddleOCR + TrOCR (Robust)")
# gr.Markdown("Using direct DBNet inference to avoid library bugs.")
# with gr.Row():
# with gr.Column(scale=1):
# input_img = gr.Image(type="pil", label="Upload Image")
# btn = gr.Button("Transcribe", variant="primary")
# with gr.Column(scale=1):
# output_img = gr.Image(label="Detections (Paddle)")
# output_txt = gr.Textbox(label="Extracted Text", lines=15, show_copy_button=True)
# with gr.Row():
# gallery = gr.Gallery(label="Line Crops (Debug)", columns=6, height=200)
# btn.click(process_image, input_img, [output_img, gallery, output_txt])
# if __name__ == "__main__":
# demo.launch()
# import gradio as gr
# import torch
# import numpy as np
# import cv2
# from PIL import Image
# from transformers import TrOCRProcessor, VisionEncoderDecoderModel
# from paddleocr import PaddleOCR
# # --- 1. SETUP TR-OCR ---
# device = "cuda" if torch.cuda.is_available() else "cpu"
# print(f"Loading TrOCR on {device}...")
# processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
# model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten').to(device).eval()
# # --- 2. SETUP PADDLEOCR ---
# print("Loading PaddleOCR...")
# # High resolution to catch faint text
# detector = PaddleOCR(use_angle_cls=True, lang='en', show_log=False,
# det_limit_side_len=2500, det_db_thresh=0.1, det_db_box_thresh=0.3)
# # ==========================================
# # 🧠 LOGIC FIX 1: REMOVE NESTED BOXES
# # ==========================================
# def calculate_overlap_area(box1, box2):
# """Calculates the intersection area between two boxes."""
# x1 = max(box1[0], box2[0])
# y1 = max(box1[1], box2[1])
# x2 = min(box1[2], box2[2])
# y2 = min(box1[3], box2[3])
# if x2 < x1 or y2 < y1:
# return 0.0
# return (x2 - x1) * (y2 - y1)
# def filter_nested_boxes(boxes, containment_thresh=0.80):
# """
# Removes boxes that are mostly contained within other larger boxes.
# """
# if not boxes: return []
# # Convert all to [x1, y1, x2, y2, area]
# active = []
# for b in boxes:
# area = (b[2] - b[0]) * (b[3] - b[1])
# active.append(list(b) + [area])
# # Sort by area (Largest to Smallest) - Crucial!
# # We want to keep the big 'parent' box and delete the small 'child' box.
# active.sort(key=lambda x: x[4], reverse=True)
# final_boxes = []
# for i, current in enumerate(active):
# is_nested = False
# curr_area = current[4]
# # Check against all boxes we've already accepted (which are bigger/same size)
# for kept in final_boxes:
# overlap = calculate_overlap_area(current, kept)
# # Check if 'current' is inside 'kept'
# # If >80% of current box is covered by kept box, it's a duplicate/nested box
# if (overlap / curr_area) > containment_thresh:
# is_nested = True
# break
# if not is_nested:
# final_boxes.append(current[:4]) # Store only coord, drop area
# return final_boxes
# # ==========================================
# # 🧠 LOGIC FIX 2: MERGE WORDS INTO LINES
# # ==========================================
# def merge_boxes_into_lines(raw_boxes, y_thresh=30):
# if raw_boxes is None or len(raw_boxes) == 0:
# return []
# # 1. Convert raw polygons to Axis-Aligned Rectangles
# rects = []
# for box in raw_boxes:
# box = np.array(box).astype(np.float32)
# x1 = np.min(box[:, 0])
# y1 = np.min(box[:, 1])
# x2 = np.max(box[:, 0])
# y2 = np.max(box[:, 1])
# rects.append([x1, y1, x2, y2])
# # 🔴 STEP 2: Filter Nested Boxes (Remove the 'child' boxes)
# rects = filter_nested_boxes(rects)
# # 3. Sort by Y center
# rects.sort(key=lambda r: (r[1] + r[3]) / 2)
# merged_lines = []
# while rects:
# current_line = [rects.pop(0)]
# line_y_center = (current_line[0][1] + current_line[0][3]) / 2
# remaining = []
# for r in rects:
# r_y_center = (r[1] + r[3]) / 2
# # If Y-center is close (same horizontal line)
# if abs(r_y_center - line_y_center) < y_thresh:
# current_line.append(r)
# else:
# remaining.append(r)
# rects = remaining
# # 4. Create Line Box
# lx1 = min(r[0] for r in current_line)
# ly1 = min(r[1] for r in current_line)
# lx2 = max(r[2] for r in current_line)
# ly2 = max(r[3] for r in current_line)
# merged_lines.append([lx1, ly1, lx2, ly2])
# # Final Sort by Y
# merged_lines.sort(key=lambda r: r[1])
# return merged_lines
# def process_image(image):
# if image is None: return None, [], "Please upload an image."
# image_np = np.array(image.convert("RGB"))
# # DETECT
# try:
# dt_boxes, _ = detector.text_detector(image_np)
# except Exception as e:
# return image, [], f"Detection Error: {str(e)}"
# if dt_boxes is None or len(dt_boxes) == 0:
# return image, [], "No text detected."
# # PROCESS (Filter Nested -> Merge Lines)
# line_boxes = merge_boxes_into_lines(dt_boxes)
# annotated_img = image_np.copy()
# results = []
# debug_crops = []
# for box in line_boxes:
# x1, y1, x2, y2 = map(int, box)
# # Filter Noise
# if (x2 - x1) < 20 or (y2 - y1) < 15:
# continue
# # Draw (Green)
# cv2.rectangle(annotated_img, (x1, y1), (x2, y2), (0, 255, 0), 2)
# # PADDING
# PAD = 10
# h, w, _ = image_np.shape
# x1 = max(0, x1 - PAD)
# y1 = max(0, y1 - PAD)
# x2 = min(w, x2 + PAD)
# y2 = min(h, y2 + PAD)
# crop = image_np[y1:y2, x1:x2]
# pil_crop = Image.fromarray(crop)
# debug_crops.append(pil_crop)
# # RECOGNIZE
# with torch.no_grad():
# pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
# generated_ids = model.generate(pixel_values)
# text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
# if text.strip():
# results.append(text)
# full_text = "\n".join(results)
# return Image.fromarray(annotated_img), debug_crops, full_text
# # --- UI ---
# with gr.Blocks(theme=gr.themes.Soft()) as demo:
# gr.Markdown("# ⚡ Smart Line-Level OCR (Cleaned)")
# with gr.Row():
# with gr.Column(scale=1):
# input_img = gr.Image(type="pil", label="Upload Image")
# btn = gr.Button("Transcribe", variant="primary")
# with gr.Column(scale=1):
# output_img = gr.Image(label="Cleaned Lines (Green Boxes)")
# output_txt = gr.Textbox(label="Extracted Text", lines=15, show_copy_button=True)
# with gr.Row():
# gallery = gr.Gallery(label="Final Line Crops", columns=4, height=200)
# btn.click(process_image, input_img, [output_img, gallery, output_txt])
# if __name__ == "__main__":
# demo.launch()
# import gradio as gr
# import torch
# import numpy as np
# import cv2
# from PIL import Image
# from transformers import TrOCRProcessor, VisionEncoderDecoderModel
# from paddleocr import PaddleOCR
# # Setup
# device = "cuda" if torch.cuda.is_available() else "cpu"
# print(f"Loading TrOCR on {device}...")
# processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
# model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten').to(device).eval()
# print("Loading PaddleOCR...")
# detector = PaddleOCR(use_angle_cls=True, lang='en', show_log=False,
# det_limit_side_len=2500, det_db_thresh=0.1, det_db_box_thresh=0.3)
# def calculate_iou(box1, box2):
# """Calculate Intersection over Union"""
# x1 = max(box1[0], box2[0])
# y1 = max(box1[1], box2[1])
# x2 = min(box1[2], box2[2])
# y2 = min(box1[3], box2[3])
# if x2 < x1 or y2 < y1:
# return 0.0
# intersection = (x2 - x1) * (y2 - y1)
# area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
# area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
# return intersection / min(area1, area2)
# def remove_nested_boxes(boxes, iou_thresh=0.7):
# """Remove boxes that are nested inside others"""
# if len(boxes) == 0:
# return []
# # Add area to each box
# boxes_with_area = []
# for b in boxes:
# area = (b[2] - b[0]) * (b[3] - b[1])
# boxes_with_area.append((*b, area))
# # Sort by area descending (keep larger boxes)
# boxes_with_area.sort(key=lambda x: x[4], reverse=True)
# keep = []
# for i, current in enumerate(boxes_with_area):
# should_keep = True
# curr_box = current[:4]
# for kept in keep:
# iou = calculate_iou(curr_box, kept)
# if iou > iou_thresh:
# should_keep = False
# break
# if should_keep:
# keep.append(curr_box)
# return keep
# def merge_boxes_into_lines(raw_boxes, y_overlap_thresh=0.5, x_gap_thresh=100):
# """Merge boxes into lines with better horizontal merging"""
# if raw_boxes is None or len(raw_boxes) == 0:
# return []
# # Convert polygons to rectangles
# rects = []
# for box in raw_boxes:
# box = np.array(box).astype(np.float32)
# x1, y1 = np.min(box[:, 0]), np.min(box[:, 1])
# x2, y2 = np.max(box[:, 0]), np.max(box[:, 1])
# rects.append([x1, y1, x2, y2])
# # Remove nested boxes
# rects = remove_nested_boxes(rects)
# if len(rects) == 0:
# return []
# # Sort by Y position
# rects.sort(key=lambda r: r[1])
# # Group into lines based on Y overlap
# lines = []
# current_line = [rects[0]]
# for rect in rects[1:]:
# # Check if rect belongs to current line
# line_y1 = min(r[1] for r in current_line)
# line_y2 = max(r[3] for r in current_line)
# line_height = line_y2 - line_y1
# rect_y1, rect_y2 = rect[1], rect[3]
# rect_height = rect_y2 - rect_y1
# # Calculate vertical overlap
# overlap_y1 = max(line_y1, rect_y1)
# overlap_y2 = min(line_y2, rect_y2)
# overlap = max(0, overlap_y2 - overlap_y1)
# # If significant vertical overlap, it's the same line
# if overlap > y_overlap_thresh * min(line_height, rect_height):
# current_line.append(rect)
# else:
# # Save current line and start new one
# lines.append(current_line)
# current_line = [rect]
# lines.append(current_line)
# # Merge boxes in each line
# merged = []
# for line in lines:
# # Sort line boxes left to right
# line.sort(key=lambda r: r[0])
# # Merge horizontally close boxes
# merged_line = [line[0]]
# for rect in line[1:]:
# last = merged_line[-1]
# # If close horizontally, merge
# if rect[0] - last[2] < x_gap_thresh:
# merged_line[-1] = [
# min(last[0], rect[0]),
# min(last[1], rect[1]),
# max(last[2], rect[2]),
# max(last[3], rect[3])
# ]
# else:
# merged_line.append(rect)
# # Final merge: combine all boxes in line into one
# x1 = min(r[0] for r in merged_line)
# y1 = min(r[1] for r in merged_line)
# x2 = max(r[2] for r in merged_line)
# y2 = max(r[3] for r in merged_line)
# merged.append([x1, y1, x2, y2])
# # Sort by Y
# merged.sort(key=lambda r: r[1])
# return merged
# def process_image(image):
# if image is None:
# return None, [], "Please upload an image."
# image_np = np.array(image.convert("RGB"))
# try:
# dt_boxes, _ = detector.text_detector(image_np)
# except Exception as e:
# return image, [], f"Detection Error: {str(e)}"
# if dt_boxes is None or len(dt_boxes) == 0:
# return image, [], "No text detected."
# line_boxes = merge_boxes_into_lines(dt_boxes)
# annotated_img = image_np.copy()
# results = []
# debug_crops = []
# for box in line_boxes:
# x1, y1, x2, y2 = map(int, box)
# if (x2 - x1) < 20 or (y2 - y1) < 15:
# continue
# cv2.rectangle(annotated_img, (x1, y1), (x2, y2), (0, 255, 0), 2)
# PAD = 10
# h, w, _ = image_np.shape
# x1 = max(0, x1 - PAD)
# y1 = max(0, y1 - PAD)
# x2 = min(w, x2 + PAD)
# y2 = min(h, y2 + PAD)
# crop = image_np[y1:y2, x1:x2]
# pil_crop = Image.fromarray(crop)
# debug_crops.append(pil_crop)
# with torch.no_grad():
# pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
# generated_ids = model.generate(pixel_values)
# text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
# if text.strip():
# results.append(text)
# full_text = "\n".join(results)
# return Image.fromarray(annotated_img), debug_crops, full_text
# with gr.Blocks(theme=gr.themes.Soft()) as demo:
# gr.Markdown("# ⚡ Smart Line-Level OCR (Fixed)")
# with gr.Row():
# with gr.Column(scale=1):
# input_img = gr.Image(type="pil", label="Upload Image")
# btn = gr.Button("Transcribe", variant="primary")
# with gr.Column(scale=1):
# output_img = gr.Image(label="Detected Lines")
# output_txt = gr.Textbox(label="Extracted Text", lines=15, show_copy_button=True)
# with gr.Row():
# gallery = gr.Gallery(label="Line Crops", columns=4, height=200)
# btn.click(process_image, input_img, [output_img, gallery, output_txt])
# if __name__ == "__main__":
# demo.launch()
#https://github.com/czczup/FAST
import gradio as gr
import torch
import numpy as np
import cv2
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from paddleocr import PaddleOCR
import pandas as pd
# --- 1. SETUP TR-OCR (handwriting recognition model) ---
# Pick GPU when available; Spaces free tier is usually CPU-only.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Loading TrOCR on {device}...")
# Processor handles image preprocessing + token decoding for the model.
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
# .eval() disables dropout; inference below runs under torch.no_grad().
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten').to(device).eval()
# --- 2. SETUP PADDLEOCR (text detection only; recognition is done by TrOCR) ---
print("Loading PaddleOCR...")
# High resolution settings to detect faint text:
# det_limit_side_len=2500 keeps large images sharp; low det_db_thresh /
# det_db_box_thresh make the DBNet detector more sensitive (more boxes).
detector = PaddleOCR(use_angle_cls=True, lang='en', show_log=False,
                     det_limit_side_len=2500, det_db_thresh=0.1, det_db_box_thresh=0.3)
# ==========================================
# 🧠 LOGIC: INTERSECTION OVER UNION (IOU)
# ==========================================
def calculate_iou_containment(box1, box2):
    """Return the fraction of box1's area that lies inside box2.

    Parameters
    ----------
    box1, box2 : sequence of 4 numbers
        Axis-aligned boxes as [x1, y1, x2, y2].

    Returns
    -------
    float
        Containment ratio in [0.0, 1.0]. Returns 0.0 when the boxes do not
        overlap, and also when box1 is degenerate (zero width or height) —
        the original implementation raised ZeroDivisionError in that case.
    """
    ix1 = max(box1[0], box2[0])
    iy1 = max(box1[1], box2[1])
    ix2 = min(box1[2], box2[2])
    iy2 = min(box1[3], box2[3])
    if ix2 < ix1 or iy2 < iy1:
        return 0.0
    intersection = (ix2 - ix1) * (iy2 - iy1)
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    # Guard: a zero-area detection box would otherwise divide by zero.
    if area1 <= 0:
        return 0.0
    return intersection / area1
def filter_nested_boxes(boxes, containment_thresh=0.85):
    """Drop boxes that are mostly contained within other, larger boxes.

    Parameters
    ----------
    boxes : list of [x1, y1, x2, y2]
        Candidate rectangles.
    containment_thresh : float
        A box is discarded when more than this fraction of its area lies
        inside an already-kept (larger) box.

    Returns
    -------
    list of [x1, y1, x2, y2]
        Surviving boxes, largest-area first.
    """
    if not boxes:
        return []
    # Attach the area to each rect so we can process biggest-first: the big
    # "parent" box must be accepted before its small "child" is examined.
    sized = sorted(
        (list(rect) + [(rect[2] - rect[0]) * (rect[3] - rect[1])] for rect in boxes),
        key=lambda item: item[4],
        reverse=True,
    )
    kept = []
    for candidate in sized:
        rect = candidate[:4]
        # Accept unless an already-kept box covers most of this candidate.
        if all(
            calculate_iou_containment(rect, parent) <= containment_thresh
            for parent in kept
        ):
            kept.append(rect)
    return kept
# ==========================================
# 🧠 LOGIC: STRICT LINE MERGING
# ==========================================
def merge_boxes_into_lines(raw_boxes, log_data):
    """Collapse word-level detections into one axis-aligned box per text line.

    Boxes are merged horizontally only; vertical merging is prevented by a
    strict per-line tolerance. Progress messages are appended to ``log_data``
    (mutated in place).

    Parameters
    ----------
    raw_boxes : sequence of polygons (Nx2 point arrays) or None
        Raw detector output.
    log_data : list of str
        Debug log accumulator.

    Returns
    -------
    list of [x1, y1, x2, y2]
        One merged box per detected line, sorted top to bottom.
    """
    if raw_boxes is None or len(raw_boxes) == 0:
        return []

    # 1. Reduce every detected polygon to its axis-aligned bounding rectangle.
    rects = []
    for poly in raw_boxes:
        pts = np.array(poly).astype(np.float32)
        rects.append([
            np.min(pts[:, 0]), np.min(pts[:, 1]),
            np.max(pts[:, 0]), np.max(pts[:, 1]),
        ])
    log_data.append(f"Raw Detections: {len(rects)} boxes found.")

    # 2. Discard duplicate boxes nested inside bigger ones.
    rects = filter_nested_boxes(rects)
    log_data.append(f"After Cleaning Nested: {len(rects)} boxes remain.")

    # 3. Work top-to-bottom by vertical center.
    rects.sort(key=lambda r: (r[1] + r[3]) / 2)

    lines = []
    while rects:
        # Seed a new line with the highest remaining box.
        seed = rects.pop(0)
        seed_center = (seed[1] + seed[3]) / 2
        # STRICT RULE: a box joins this line only when its vertical center
        # lies within half of the seed box's own height.
        tolerance = (seed[3] - seed[1]) * 0.5

        members, leftover = [seed], []
        for rect in rects:
            center = (rect[1] + rect[3]) / 2
            if abs(center - seed_center) < tolerance:
                members.append(rect)
            else:
                leftover.append(rect)
        rects = leftover

        # Order the line's words left-to-right (reading order).
        members.sort(key=lambda r: r[0])

        # 4. The union of every member box becomes the line box.
        lines.append([
            min(r[0] for r in members),
            min(r[1] for r in members),
            max(r[2] for r in members),
            max(r[3] for r in members),
        ])

    # Final top-to-bottom ordering by the line's top edge.
    lines.sort(key=lambda r: r[1])
    log_data.append(f"Final Merged Lines: {len(lines)} lines created.")
    return lines
def process_image(image):
    """Detect text lines in an uploaded image and transcribe each with TrOCR.

    Uses the module-level ``detector`` (PaddleOCR DBNet) for detection and the
    module-level ``processor``/``model`` (TrOCR) for recognition on ``device``.

    Parameters
    ----------
    image : PIL.Image.Image or None
        Image from the Gradio upload widget.

    Returns
    -------
    tuple
        (annotated image or the input image or None,
         list of PIL line crops for the debug gallery,
         transcribed text (one line per detected text line) or an error/status
         message, newline-joined debug log string)
    """
    logs = []  # Store debug messages here
    if image is None:
        return None, [], "Please upload an image.", "No logs."
    # Work on an RGB numpy copy at full resolution.
    image_np = np.array(image.convert("RGB"))
    # DETECT: call Paddle's internal text_detector directly, bypassing the
    # high-level .ocr() pipeline (see earlier revisions above for the reason).
    # NOTE(review): image_np is RGB; PaddleOCR's own pipeline reads images
    # with cv2 (BGR) — confirm the channel order does not degrade detection.
    try:
        dt_boxes, _ = detector.text_detector(image_np)
    except Exception as e:
        return image, [], f"Detection Error: {str(e)}", "\n".join(logs)
    if dt_boxes is None or len(dt_boxes) == 0:
        return image, [], "No text detected.", "\n".join(logs)
    # PROCESS: de-duplicate nested boxes and merge words into line boxes.
    line_boxes = merge_boxes_into_lines(dt_boxes, logs)
    annotated_img = image_np.copy()
    results = []
    debug_crops = []
    # Log the final box coordinates for inspection
    logs.append("\n--- Final Box Coordinates ---")
    for i, box in enumerate(line_boxes):
        x1, y1, x2, y2 = map(int, box)
        logs.append(f"Line {i+1}: x={x1}, y={y1}, w={x2-x1}, h={y2-y1}")
        # Filter Noise: skip boxes too small to hold readable text.
        if (x2 - x1) < 20 or (y2 - y1) < 15:
            logs.append(f"-> Skipped Line {i+1} (Too Small/Noise)")
            continue
        # Draw (Green) the un-padded line box on the annotated preview.
        cv2.rectangle(annotated_img, (x1, y1), (x2, y2), (0, 255, 0), 2)
        # PADDING: expand the crop (clamped to image bounds) — TrOCR needs
        # breathing room around the text or it tends to hallucinate.
        PAD = 10
        h, w, _ = image_np.shape
        x1 = max(0, x1 - PAD)
        y1 = max(0, y1 - PAD)
        x2 = min(w, x2 + PAD)
        y2 = min(h, y2 + PAD)
        crop = image_np[y1:y2, x1:x2]
        pil_crop = Image.fromarray(crop)
        debug_crops.append(pil_crop)
        # RECOGNIZE: run TrOCR on the single line crop (no gradients needed).
        with torch.no_grad():
            pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
            generated_ids = model.generate(pixel_values)
            text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
            # Keep only non-blank transcriptions.
            if text.strip():
                results.append(text)
    full_text = "\n".join(results)
    return Image.fromarray(annotated_img), debug_crops, full_text, "\n".join(logs)
# --- UI ---
# Gradio layout: image input on the left; tabbed visualization / text / logs
# on the right; a gallery of the final line crops below.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# ⚡ Smart Line-Level OCR (Debug Mode)")
    with gr.Row():
        with gr.Column(scale=1):
            input_img = gr.Image(type="pil", label="Upload Image")
            btn = gr.Button("Transcribe", variant="primary")
        with gr.Column(scale=1):
            with gr.Tabs():
                with gr.Tab("Visualization"):
                    output_img = gr.Image(label="Detected Lines")
                with gr.Tab("Extracted Text"):
                    output_txt = gr.Textbox(label="Result", lines=15, show_copy_button=True)
                with gr.Tab("Debug Logs"):
                    # CHANGED HERE: Uses Textbox instead of Code to avoid version errors
                    log_output = gr.Textbox(label="Processing Logs", lines=20, interactive=False)
    with gr.Row():
        gallery = gr.Gallery(label="Final Line Crops", columns=4, height=200)
    # Wire the button to process_image; outputs match its 4-tuple return.
    btn.click(process_image, input_img, [output_img, gallery, output_txt, log_output])
# Launch for Spaces when run as a script.
if __name__ == "__main__":
    demo.launch()