iammraat commited on
Commit
116621e
·
verified ·
1 Parent(s): 227f1ab

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +249 -546
app.py CHANGED
@@ -260,607 +260,335 @@
260
 
261
 
262
 
263
- # import gradio as gr
264
- # from ultralytics import YOLO
265
- # from transformers import TrOCRProcessor, VisionEncoderDecoderModel
266
- # from PIL import Image, ImageDraw, ImageFont
267
- # import torch
268
- # import logging
269
- # from datetime import datetime
270
- # import os
271
- # import warnings
272
- # import time
273
 
274
- # # Suppress progress bar and unnecessary logs
275
- # os.environ['HF_HUB_DISABLE_PROGRESS_BARS'] = '1'
276
- # warnings.filterwarnings('ignore')
277
- # logging.getLogger('transformers').setLevel(logging.ERROR)
278
- # logging.getLogger('ultralytics').setLevel(logging.ERROR)
279
 
280
- # # Setup logging
281
- # logging.basicConfig(
282
- # level=logging.INFO,
283
- # format='%(asctime)s - %(levelname)s - %(message)s'
284
- # )
285
- # logger = logging.getLogger(__name__)
286
 
287
- # logger.info("Starting model loading...")
288
- # device = "cuda" if torch.cuda.is_available() else "cpu"
289
- # logger.info(f"Using device: {device}")
290
 
291
- # # --- ROBUST MODEL LOADING FUNCTION ---
292
- # def load_model_with_retry(model_class, model_name, token=None, retries=5, delay=5):
293
- # """Attempts to load a HF model with retries to handle network timeouts."""
294
- # for attempt in range(retries):
295
- # try:
296
- # logger.info(f"Loading {model_name} (Attempt {attempt + 1}/{retries})...")
297
- # if "Processor" in str(model_class):
298
- # return model_class.from_pretrained(model_name, token=token)
299
- # else:
300
- # return model_class.from_pretrained(model_name, token=token).to(device)
301
- # except Exception as e:
302
- # logger.warning(f"Failed to load {model_name}: {e}")
303
- # if attempt < retries - 1:
304
- # logger.info(f"Retrying in {delay} seconds...")
305
- # time.sleep(delay)
306
- # else:
307
- # logger.error(f"Given up on loading {model_name} after {retries} attempts.")
308
- # raise e
309
 
310
- # try:
311
- # # 1. Load YOLO Models (Local Files)
312
- # region_model_file = 'regions.pt'
313
- # line_model_file = 'lines.pt'
314
 
315
- # # Simple check for local files
316
- # if not os.path.exists(region_model_file):
317
- # for file in os.listdir('.'):
318
- # if 'region' in file.lower() and file.endswith('.pt'): region_model_file = file
319
- # elif 'line' in file.lower() and file.endswith('.pt'): line_model_file = file
320
 
321
- # if not os.path.exists(region_model_file) or not os.path.exists(line_model_file):
322
- # raise FileNotFoundError("YOLO .pt files (regions.pt/lines.pt) not found.")
323
 
324
- # logger.info("Loading YOLO models...")
325
- # region_model = YOLO(region_model_file)
326
- # line_model = YOLO(line_model_file)
327
- # logger.info("✓ YOLO models loaded")
328
 
329
- # # 2. Load TrOCR with Retries
330
- # hf_token = os.getenv("HF_TOKEN")
331
 
332
- # processor = load_model_with_retry(TrOCRProcessor, "microsoft/trocr-base-handwritten", token=hf_token)
333
- # logger.info("✓ TrOCR processor loaded")
334
 
335
- # trocr_model = load_model_with_retry(VisionEncoderDecoderModel, "microsoft/trocr-base-handwritten", token=hf_token)
336
- # logger.info("✓ TrOCR model loaded")
337
 
338
- # logger.info("All models loaded successfully!")
339
 
340
- # except Exception as e:
341
- # logger.error(f"CRITICAL ERROR loading models: {str(e)}")
342
- # raise
343
-
344
- # # --- OCR HELPER ---
345
- # def run_trocr(image_slice, processor, model, device):
346
- # """Runs TrOCR on a single cropped image slice."""
347
- # pixel_values = processor(images=image_slice, return_tensors="pt").pixel_values.to(device)
348
- # generated_ids = model.generate(pixel_values)
349
- # return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
350
 
351
- # def process_document(image, enable_debug_crops=False):
352
- # """Process uploaded document image and extract handwritten text with visualization."""
353
- # timestamp = datetime.now().strftime("%H:%M:%S")
354
- # log_output = []
 
 
 
 
 
 
 
355
 
356
- # def add_log(message, level="INFO"):
357
- # log_msg = f"[{timestamp}] {level}: {message}"
358
- # log_output.append(log_msg)
359
- # if level == "ERROR":
360
- # logger.error(message)
361
- # else:
362
- # logger.info(message)
363
 
364
- # add_log("Starting document processing")
365
 
366
- # if image is None:
367
- # add_log("No image provided", "ERROR")
368
- # return None, "Please upload an image", "\n".join(log_output)
369
 
370
- # try:
371
- # # Prepare Image
372
- # if not isinstance(image, Image.Image):
373
- # img = Image.open(image).convert("RGB")
374
- # else:
375
- # img = image.convert("RGB")
376
 
377
- # # Create a drawing context for the debug image
378
- # debug_img = img.copy()
379
- # draw = ImageDraw.Draw(debug_img)
380
 
381
- # width, height = img.size
382
- # add_log(f"Image size: {width}x{height} pixels")
383
 
384
- # all_lines = []
385
- # debug_crops_dir = "debug_crops"
386
 
387
- # if enable_debug_crops:
388
- # os.makedirs(debug_crops_dir, exist_ok=True)
389
- # add_log(f"Debug crops will be saved to {debug_crops_dir}/")
390
 
391
- # # --- STRATEGY 1: Region Detection ---
392
- # add_log("Strategy 1: Running region detection...")
393
- # region_results = region_model(img, conf=0.2, imgsz=1024, verbose=False)
394
- # regions = region_results[0].boxes
395
- # num_regions = len(regions)
396
- # add_log(f"✓ Found {num_regions} potential text region(s)")
397
 
398
- # found_lines_in_regions = False
399
 
400
- # if num_regions > 0:
401
- # for region_idx, region in enumerate(regions):
402
- # add_log(f"Processing region {region_idx + 1}/{num_regions}")
403
 
404
- # # FIX 1: Use round() instead of int() to minimize precision loss
405
- # rx1, ry1, rx2, ry2 = map(round, region.xyxy[0].tolist())
406
 
407
- # # Calculate region dimensions
408
- # region_width = rx2 - rx1
409
- # region_height = ry2 - ry1
410
 
411
- # add_log(f" Region coords: ({rx1}, {ry1}) → ({rx2}, {ry2}), size: {region_width}x{region_height}")
412
 
413
- # # Filter small artifacts
414
- # if region_width < 50 or region_height < 50:
415
- # add_log(f" Skipping tiny artifact: {region_width}x{region_height} px")
416
- # continue
417
 
418
- # # FIX 2: Add padding to region crops to avoid edge effects
419
- # padding = 10
420
- # padded_rx1 = max(0, rx1 - padding)
421
- # padded_ry1 = max(0, ry1 - padding)
422
- # padded_rx2 = min(width, rx2 + padding)
423
- # padded_ry2 = min(height, ry2 + padding)
424
 
425
- # add_log(f" Padded coords: ({padded_rx1}, {padded_ry1}) → ({padded_rx2}, {padded_ry2})")
426
 
427
- # # Draw GREEN box for Region (original bounds, not padded)
428
- # draw.rectangle([rx1, ry1, rx2, ry2], outline="green", width=5)
429
 
430
- # # Crop Region with padding
431
- # region_crop = img.crop((padded_rx1, padded_ry1, padded_rx2, padded_ry2))
432
 
433
- # if enable_debug_crops:
434
- # region_crop.save(f"{debug_crops_dir}/region_{region_idx:02d}.png")
435
 
436
- # # Detect lines in this region
437
- # add_log(f" Running line detection on region crop ({region_crop.size[0]}x{region_crop.size[1]})...")
438
- # line_results = line_model(region_crop, conf=0.2, imgsz=1024, verbose=False)
439
- # lines_data = line_results[0].boxes.xyxy.cpu().numpy()
440
- # num_lines = len(lines_data)
441
- # add_log(f" ✓ Found {num_lines} line(s) in region")
442
 
443
- # if num_lines > 0:
444
- # found_lines_in_regions = True
445
 
446
- # # Sort lines by Y position (index 1 of xyxy)
447
- # sorted_indices = lines_data[:, 1].argsort()
448
 
449
- # for line_idx, idx in enumerate(sorted_indices):
450
- # # FIX 3: Use round() for line coordinates too
451
- # lx1, ly1, lx2, ly2 = map(round, lines_data[idx].tolist())
452
 
453
- # line_width = lx2 - lx1
454
- # line_height = ly2 - ly1
455
 
456
- # add_log(f" Line {line_idx + 1} (local coords): ({lx1}, {ly1}) → ({lx2}, {ly2}), size: {line_width}x{line_height}")
457
 
458
- # # FIX 4: Translate line coordinates back to original image space
459
- # # Account for padding offset
460
- # global_lx1 = padded_rx1 + lx1
461
- # global_ly1 = padded_ry1 + ly1
462
- # global_lx2 = padded_rx1 + lx2
463
- # global_ly2 = padded_ry1 + ly2
464
 
465
- # # FIX 5: Validate coordinates are within image bounds
466
- # global_lx1 = max(0, min(width, global_lx1))
467
- # global_ly1 = max(0, min(height, global_ly1))
468
- # global_lx2 = max(0, min(width, global_lx2))
469
- # global_ly2 = max(0, min(height, global_ly2))
470
 
471
- # add_log(f" Line {line_idx + 1} (global coords): ({global_lx1}, {global_ly1}) → ({global_lx2}, {global_ly2})")
472
 
473
- # # Draw RED box for Line
474
- # draw.rectangle([global_lx1, global_ly1, global_lx2, global_ly2], outline="red", width=3)
475
 
476
- # # OCR on the line crop from region_crop
477
- # line_crop = region_crop.crop((lx1, ly1, lx2, ly2))
478
 
479
- # if enable_debug_crops:
480
- # line_crop.save(f"{debug_crops_dir}/region_{region_idx:02d}_line_{line_idx:02d}.png")
481
 
482
- # text = run_trocr(line_crop, processor, trocr_model, device)
483
- # add_log(f" Line {line_idx + 1} OCR: '{text}'")
484
- # all_lines.append(text)
485
-
486
- # # --- STRATEGY 2: Fallback to Full Page ---
487
- # if not found_lines_in_regions:
488
- # add_log("⚠️ Region detection yielded no lines. Switching to Fallback Strategy...", "WARNING")
489
- # add_log("Strategy 2: Running line detection on full page")
490
 
491
- # line_results = line_model(img, conf=0.2, imgsz=1024, verbose=False)
492
- # lines_data = line_results[0].boxes.xyxy.cpu().numpy()
493
- # num_lines = len(lines_data)
494
- # add_log(f"✓ Fallback found {num_lines} line(s) on full page")
495
 
496
- # if num_lines > 0:
497
- # sorted_indices = lines_data[:, 1].argsort()
498
 
499
- # for line_idx, idx in enumerate(sorted_indices):
500
- # # FIX 6: Use round() consistently
501
- # lx1, ly1, lx2, ly2 = map(round, lines_data[idx].tolist())
502
 
503
- # line_width = lx2 - lx1
504
- # line_height = ly2 - ly1
505
 
506
- # add_log(f" Fallback Line {line_idx + 1}: ({lx1}, {ly1}) → ({lx2}, {ly2}), size: {line_width}x{line_height}")
507
 
508
- # # FIX 7: Validate coordinates
509
- # lx1 = max(0, min(width, lx1))
510
- # ly1 = max(0, min(height, ly1))
511
- # lx2 = max(0, min(width, lx2))
512
- # ly2 = max(0, min(height, ly2))
513
 
514
- # # Draw RED box for Line (on full image)
515
- # draw.rectangle([lx1, ly1, lx2, ly2], outline="red", width=3)
516
 
517
- # line_crop = img.crop((lx1, ly1, lx2, ly2))
518
 
519
- # if enable_debug_crops:
520
- # line_crop.save(f"{debug_crops_dir}/fullpage_line_{line_idx:02d}.png")
521
 
522
- # text = run_trocr(line_crop, processor, trocr_model, device)
523
- # add_log(f" Fallback Line {line_idx + 1} OCR: '{text}'")
524
- # all_lines.append(text)
525
 
526
- # if not all_lines:
527
- # add_log("Failed to detect any text lines in both strategies", "ERROR")
528
- # return debug_img, "No text could be extracted.", "\n".join(log_output)
529
 
530
- # add_log(f"✓ Success! Extracted {len(all_lines)} total line(s)")
531
 
532
- # if enable_debug_crops:
533
- # add_log(f"✓ Debug crops saved to {debug_crops_dir}/")
534
 
535
- # final_text = '\n'.join(all_lines)
536
 
537
- # return debug_img, final_text, "\n".join(log_output)
538
 
539
- # except Exception as e:
540
- # error_msg = f"Error processing image: {str(e)}"
541
- # add_log(error_msg, "ERROR")
542
- # logger.exception("Full error traceback:")
543
- # return image, f"Error: {str(e)}", "\n".join(log_output)
544
 
545
- # # Create Gradio interface
546
- # demo = gr.Interface(
547
- # fn=process_document,
548
- # inputs=[
549
- # gr.Image(type="pil", label="Upload Handwritten Document"),
550
- # gr.Checkbox(label="Save debug crops to disk", value=False)
551
- # ],
552
- # outputs=[
553
- # gr.Image(type="pil", label="Debug Visualization (Green=Region, Red=Lines)"),
554
- # gr.Textbox(label="Extracted Text", lines=10),
555
- # gr.Textbox(label="Processing Logs", lines=15)
556
- # ],
557
- # title="📝 Handwritten Text Recognition (HTR) with Enhanced Debugging",
558
- # description="""
559
- # Upload an image of a handwritten document.
560
 
561
- # **Visualization Key:**
562
- # - 🟩 **Green Box:** The broad region identified as containing text (original bounds).
563
- # - 🟥 **Red Box:** The specific line of text sent to the OCR engine (with coordinate validation).
564
 
565
- # **Improvements:**
566
- # - Fixed coordinate rounding (eliminates truncation errors)
567
- # - Added 10px padding to region crops (reduces edge effects)
568
- # - Coordinate validation (ensures all boxes are within image bounds)
569
- # - Enhanced logging with detailed coordinate tracking
570
- # - Optional debug crop saving
571
- # """,
572
- # flagging_mode="never",
573
- # theme=gr.themes.Soft()
574
- # )
575
-
576
- # if __name__ == "__main__":
577
- # logger.info("Launching Gradio interface...")
578
- # demo.launch()
579
-
580
-
581
-
582
-
583
-
584
-
585
-
586
-
587
-
588
-
589
-
590
-
591
-
592
-
593
-
594
-
595
-
596
-
597
-
598
-
599
-
600
-
601
- import gradio as gr
602
- from ultralytics import YOLO
603
- from transformers import TrOCRProcessor, VisionEncoderDecoderModel
604
- from PIL import Image, ImageDraw
605
- import torch
606
- import logging
607
- import os
608
- import warnings
609
- import time
610
- from datetime import datetime
611
-
612
- # Suppress noisy logs
613
- os.environ['HF_HUB_DISABLE_PROGRESS_BARS'] = '1'
614
- warnings.filterwarnings('ignore')
615
- logging.getLogger('transformers').setLevel(logging.ERROR)
616
- logging.getLogger('ultralytics').setLevel(logging.WARNING) # still allow important warnings
617
-
618
- # Setup clean logging
619
- logging.basicConfig(level=logging.INFO, format='%(asctime)s | %(levelname)-5s | %(message)s')
620
- logger = logging.getLogger(__name__)
621
-
622
- logger.info("Initializing models...")
623
- device = "cuda" if torch.cuda.is_available() else "cpu"
624
- logger.info(f"Device: {device}")
625
-
626
- def load_with_retry(cls, name, token=None, retries=4, delay=6):
627
- for attempt in range(1, retries + 1):
628
- try:
629
- logger.info(f"Loading {name} (attempt {attempt}/{retries})")
630
- if "Processor" in str(cls):
631
- return cls.from_pretrained(name, token=token)
632
- return cls.from_pretrained(name, token=token).to(device)
633
- except Exception as e:
634
- logger.warning(f"Load failed: {e}")
635
- if attempt < retries:
636
- time.sleep(delay)
637
- raise RuntimeError(f"Failed to load {name} after {retries} attempts")
638
-
639
- try:
640
- # Locate local YOLO weights
641
- region_pt = 'regions.pt'
642
- line_pt = 'lines.pt'
643
-
644
- if not os.path.exists(region_pt):
645
- for f in os.listdir('.'):
646
- name = f.lower()
647
- if 'region' in name and name.endswith('.pt'): region_pt = f
648
- if 'line' in name and name.endswith('.pt'): line_pt = f
649
-
650
- if not all(os.path.exists(p) for p in [region_pt, line_pt]):
651
- raise FileNotFoundError("Could not find regions.pt and lines.pt (or similar)")
652
-
653
- logger.info("Loading YOLO models...")
654
- region_model = YOLO(region_pt)
655
- line_model = YOLO(line_pt)
656
- logger.info("YOLO models loaded")
657
-
658
- hf_token = os.getenv("HF_TOKEN")
659
- processor = load_with_retry(TrOCRProcessor, "microsoft/trocr-base-handwritten", hf_token)
660
- trocr = load_with_retry(VisionEncoderDecoderModel, "microsoft/trocr-base-handwritten", hf_token)
661
- logger.info("TrOCR loaded → ready")
662
-
663
- except Exception as e:
664
- logger.error(f"Model loading failed: {e}", exc_info=True)
665
- raise
666
-
667
-
668
-
669
-
670
-
671
- def run_ocr(crop: Image.Image) -> str:
672
- if crop.width < 20 or crop.height < 12:
673
- return ""
674
- pixels = processor(images=crop, return_tensors="pt").pixel_values.to(device)
675
- ids = trocr.generate(pixels, max_new_tokens=128)
676
- return processor.batch_decode(ids, skip_special_tokens=True)[0].strip()
677
-
678
-
679
- def process_document(
680
- image,
681
- enable_debug_crops: bool = False,
682
- region_imgsz: int = 1024,
683
- line_imgsz_base: int = 768,
684
- conf_thresh: float = 0.25,
685
- ):
686
- start_ts = datetime.now().strftime("%H:%M:%S")
687
- logs = []
688
-
689
- def log(msg: str, level: str = "INFO"):
690
- line = f"[{start_ts}] {level:5} {msg}"
691
- logs.append(line)
692
- if level == "ERROR":
693
- logger.error(msg)
694
- else:
695
- logger.info(msg)
696
-
697
- log("Start processing")
698
-
699
- if image is None:
700
- log("No image uploaded", "ERROR")
701
- return None, "Upload an image", "\n".join(logs)
702
-
703
- try:
704
- # ── Prepare ─────────────────────────────────────────────────────────────
705
- if not isinstance(image, Image.Image):
706
- img = Image.open(image).convert("RGB")
707
- else:
708
- img = image.convert("RGB")
709
-
710
- debug_img = img.copy()
711
- draw = ImageDraw.Draw(debug_img)
712
- w, h = img.size
713
- log(f"Input image: {w} × {h} px")
714
-
715
- debug_dir = "debug_crops"
716
- if enable_debug_crops:
717
- os.makedirs(debug_dir, exist_ok=True)
718
- log(f"Debug crops → {debug_dir}/")
719
-
720
- extracted = []
721
- used_fallback = False
722
-
723
- # ── Strategy 1: Region → Lines ──────────────────────────────────────────
724
- log(f"Running region detection (imgsz={region_imgsz}) …")
725
- res_region = region_model(img, conf=conf_thresh, imgsz=region_imgsz, verbose=False)[0]
726
- boxes_region = res_region.boxes
727
-
728
- log(f"→ {len(boxes_region)} region candidate(s) (conf ≥ {conf_thresh})")
729
-
730
- found_any_line = False
731
-
732
- for i, box in enumerate(boxes_region, 1):
733
- conf = float(box.conf)
734
- xyxy = box.xyxy[0].cpu().tolist()
735
- rx1, ry1, rx2, ry2 = map(round, xyxy)
736
-
737
- rw, rh = rx2 - rx1, ry2 - ry1
738
- log(f"Region {i}/{len(boxes_region)} conf={conf:.3f} {rx1},{ry1} → {rx2},{ry2} ({rw}×{rh})")
739
-
740
- if rw < 60 or rh < 40:
741
- log(f" → skipped (too small)")
742
- continue
743
-
744
- # Padding
745
- pad = 12
746
- px1 = max(0, rx1 - pad)
747
- py1 = max(0, ry1 - pad)
748
- px2 = min(w, rx2 + pad)
749
- py2 = min(h, ry2 + pad)
750
-
751
- log(f" Padded crop: {px1},{py1} → {px2},{py2}")
752
-
753
- draw.rectangle((rx1, ry1, rx2, ry2), outline="green", width=4)
754
-
755
- crop_region = img.crop((px1, py1, px2, py2))
756
- crop_w, crop_h = crop_region.size
757
-
758
- if enable_debug_crops:
759
- crop_region.save(f"{debug_dir}/region_{i:02d}.png")
760
-
761
- # Adaptive line imgsz: bigger crops → bigger inference size
762
- line_sz = line_imgsz_base
763
- if max(crop_w, crop_h) > 1400:
764
- line_sz = 1280
765
- elif max(crop_w, crop_h) < 400:
766
- line_sz = 640
767
-
768
- log(f" → line detection (imgsz={line_sz}) on {crop_w}×{crop_h} crop …")
769
- res_line = line_model(crop_region, conf=conf_thresh, imgsz=line_sz, verbose=False)[0]
770
- line_boxes = res_line.boxes
771
-
772
- log(f" → {len(line_boxes)} line candidate(s)")
773
-
774
- if len(line_boxes) == 0:
775
- continue
776
-
777
- found_any_line = True
778
-
779
- # Sort top → bottom
780
- ys = line_boxes.xyxy[:, 1].cpu().numpy()
781
- order = ys.argsort()
782
-
783
- for j, idx in enumerate(order, 1):
784
- conf_line = float(line_boxes.conf[idx])
785
- lx1, ly1, lx2, ly2 = map(round, line_boxes.xyxy[idx].cpu().tolist())
786
-
787
- lw, lh = lx2 - lx1, ly2 - ly1
788
- log(f" Line {j} conf={conf_line:.3f} local {lx1},{ly1} → {lx2},{ly2} ({lw}×{lh})")
789
-
790
- # Back to global coordinates
791
- gx1 = px1 + lx1
792
- gy1 = py1 + ly1
793
- gx2 = px1 + lx2
794
- gy2 = py1 + ly2
795
-
796
- # Safety clamp
797
- gx1, gy1 = max(0, gx1), max(0, gy1)
798
- gx2, gy2 = min(w, gx2), min(h, gy2)
799
-
800
- log(f" → global {gx1},{gy1} → {gx2},{gy2}")
801
-
802
- draw.rectangle((gx1, gy1, gx2, gy2), outline="red", width=3)
803
-
804
- line_crop = crop_region.crop((lx1, ly1, lx2, ly2))
805
-
806
- if enable_debug_crops:
807
- line_crop.save(f"{debug_dir}/reg{i:02d}_line{j:02d}_conf{conf_line:.2f}.png")
808
-
809
- text = run_ocr(line_crop)
810
- log(f" OCR → '{text}'")
811
- if text:
812
- extracted.append(text)
813
 
814
- # ── Strategy 2: Fallback full-page line detection ───────────────────────
815
- if not found_any_line:
816
- used_fallback = True
817
- log("No lines found in regions → fallback: full-page line detection")
818
 
819
- line_sz = 1024 if max(w, h) > 1800 else line_imgsz_base
820
- log(f"Full-page line detection (imgsz={line_sz}) …")
821
 
822
- res = line_model(img, conf=conf_thresh, imgsz=line_sz, verbose=False)[0]
823
- boxes = res.boxes
824
 
825
- log(f"→ {len(boxes)} line(s) on full page")
826
 
827
- if len(boxes) > 0:
828
- ys = boxes.xyxy[:, 1].cpu().numpy()
829
- order = ys.argsort()
830
 
831
- for j, idx in enumerate(order, 1):
832
- conf = float(boxes.conf[idx])
833
- x1, y1, x2, y2 = map(round, boxes.xyxy[idx].cpu().tolist())
834
- log(f" Line {j} conf={conf:.3f} {x1},{y1} → {x2},{y2}")
835
 
836
- draw.rectangle((x1,y1,x2,y2), outline="red", width=3)
837
 
838
- crop = img.crop((x1,y1,x2,y2))
839
 
840
- if enable_debug_crops:
841
- crop.save(f"{debug_dir}/fallback_line{j:02d}_conf{conf:.2f}.png")
842
 
843
- text = run_ocr(crop)
844
- log(f" OCR → '{text}'")
845
- if text:
846
- extracted.append(text)
847
 
848
- # ── Finalize ────────────────────────────────────────────────────────────
849
- if not extracted:
850
- msg = "No readable text lines detected in either strategy"
851
- log(msg, "WARNING")
852
- return debug_img, msg, "\n".join(logs)
853
 
854
- log(f"Success — extracted {len(extracted)} line(s)")
855
- if enable_debug_crops:
856
- log(f"Debug crops saved to {debug_dir}/")
857
 
858
- return debug_img, "\n".join(extracted), "\n".join(logs)
859
 
860
- except Exception as e:
861
- log(f"Processing failed: {e}", "ERROR")
862
- logger.exception("Traceback:")
863
- return debug_img, f"Error: {str(e)}", "\n".join(logs)
864
 
865
 
866
 
@@ -868,29 +596,4 @@ def process_document(
868
 
869
 
870
 
871
- demo = gr.Interface(
872
- fn=process_document,
873
- inputs=[
874
- gr.Image(type="pil", label="Handwritten document"),
875
- gr.Checkbox(label="Save debug crops", value=False),
876
- gr.Slider(640, 1600, step=64, value=1024, label="Region detection size (imgsz)"),
877
- gr.Slider(512, 1280, step=64, value=768, label="Base line detection size"),
878
- gr.Slider(0.15, 0.5, step=0.05, value=0.25, label="Confidence threshold"),
879
- ],
880
- outputs=[
881
- gr.Image(label="Debug (green=region, red=line)"),
882
- gr.Textbox(label="Extracted Text", lines=10),
883
- gr.Textbox(label="Detailed Logs (copy these if boxes look wrong)", lines=18),
884
- ],
885
- title="Handwritten Text → OCR + Debug",
886
- description=(
887
- "Green = detected text regions • Red = individual text lines sent to TrOCR\n\n"
888
- "Copy the **Detailed Logs** if alignment still looks off — especially coords, sizes & confidences."
889
- ),
890
- theme=gr.themes.Soft(),
891
- flagging_mode="never",
892
- )
893
 
894
- if __name__ == "__main__":
895
- logger.info("Launching interface…")
896
- demo.launch()
 
260
 
261
 
262
 
263
+ import gradio as gr
264
+ from ultralytics import YOLO
265
+ from transformers import TrOCRProcessor, VisionEncoderDecoderModel
266
+ from PIL import Image, ImageDraw, ImageFont
267
+ import torch
268
+ import logging
269
+ from datetime import datetime
270
+ import os
271
+ import warnings
272
+ import time
273
 
274
+ # Suppress progress bar and unnecessary logs
275
+ os.environ['HF_HUB_DISABLE_PROGRESS_BARS'] = '1'
276
+ warnings.filterwarnings('ignore')
277
+ logging.getLogger('transformers').setLevel(logging.ERROR)
278
+ logging.getLogger('ultralytics').setLevel(logging.ERROR)
279
 
280
+ # Setup logging
281
+ logging.basicConfig(
282
+ level=logging.INFO,
283
+ format='%(asctime)s - %(levelname)s - %(message)s'
284
+ )
285
+ logger = logging.getLogger(__name__)
286
 
287
+ logger.info("Starting model loading...")
288
+ device = "cuda" if torch.cuda.is_available() else "cpu"
289
+ logger.info(f"Using device: {device}")
290
 
291
+ # --- ROBUST MODEL LOADING FUNCTION ---
292
+ def load_model_with_retry(model_class, model_name, token=None, retries=5, delay=5):
293
+ """Attempts to load a HF model with retries to handle network timeouts."""
294
+ for attempt in range(retries):
295
+ try:
296
+ logger.info(f"Loading {model_name} (Attempt {attempt + 1}/{retries})...")
297
+ if "Processor" in str(model_class):
298
+ return model_class.from_pretrained(model_name, token=token)
299
+ else:
300
+ return model_class.from_pretrained(model_name, token=token).to(device)
301
+ except Exception as e:
302
+ logger.warning(f"Failed to load {model_name}: {e}")
303
+ if attempt < retries - 1:
304
+ logger.info(f"Retrying in {delay} seconds...")
305
+ time.sleep(delay)
306
+ else:
307
+ logger.error(f"Given up on loading {model_name} after {retries} attempts.")
308
+ raise e
309
 
310
+ try:
311
+ # 1. Load YOLO Models (Local Files)
312
+ region_model_file = 'regions.pt'
313
+ line_model_file = 'lines.pt'
314
 
315
+ # Simple check for local files
316
+ if not os.path.exists(region_model_file):
317
+ for file in os.listdir('.'):
318
+ if 'region' in file.lower() and file.endswith('.pt'): region_model_file = file
319
+ elif 'line' in file.lower() and file.endswith('.pt'): line_model_file = file
320
 
321
+ if not os.path.exists(region_model_file) or not os.path.exists(line_model_file):
322
+ raise FileNotFoundError("YOLO .pt files (regions.pt/lines.pt) not found.")
323
 
324
+ logger.info("Loading YOLO models...")
325
+ region_model = YOLO(region_model_file)
326
+ line_model = YOLO(line_model_file)
327
+ logger.info("✓ YOLO models loaded")
328
 
329
+ # 2. Load TrOCR with Retries
330
+ hf_token = os.getenv("HF_TOKEN")
331
 
332
+ processor = load_model_with_retry(TrOCRProcessor, "microsoft/trocr-base-handwritten", token=hf_token)
333
+ logger.info("✓ TrOCR processor loaded")
334
 
335
+ trocr_model = load_model_with_retry(VisionEncoderDecoderModel, "microsoft/trocr-base-handwritten", token=hf_token)
336
+ logger.info("✓ TrOCR model loaded")
337
 
338
+ logger.info("All models loaded successfully!")
339
 
340
+ except Exception as e:
341
+ logger.error(f"CRITICAL ERROR loading models: {str(e)}")
342
+ raise
 
 
 
 
 
 
 
343
 
344
+ # --- OCR HELPER ---
345
+ def run_trocr(image_slice, processor, model, device):
346
+ """Runs TrOCR on a single cropped image slice."""
347
+ pixel_values = processor(images=image_slice, return_tensors="pt").pixel_values.to(device)
348
+ generated_ids = model.generate(pixel_values)
349
+ return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
350
+
351
+ def process_document(image, enable_debug_crops=False):
352
+ """Process uploaded document image and extract handwritten text with visualization."""
353
+ timestamp = datetime.now().strftime("%H:%M:%S")
354
+ log_output = []
355
 
356
+ def add_log(message, level="INFO"):
357
+ log_msg = f"[{timestamp}] {level}: {message}"
358
+ log_output.append(log_msg)
359
+ if level == "ERROR":
360
+ logger.error(message)
361
+ else:
362
+ logger.info(message)
363
 
364
+ add_log("Starting document processing")
365
 
366
+ if image is None:
367
+ add_log("No image provided", "ERROR")
368
+ return None, "Please upload an image", "\n".join(log_output)
369
 
370
+ try:
371
+ # Prepare Image
372
+ if not isinstance(image, Image.Image):
373
+ img = Image.open(image).convert("RGB")
374
+ else:
375
+ img = image.convert("RGB")
376
 
377
+ # Create a drawing context for the debug image
378
+ debug_img = img.copy()
379
+ draw = ImageDraw.Draw(debug_img)
380
 
381
+ width, height = img.size
382
+ add_log(f"Image size: {width}x{height} pixels")
383
 
384
+ all_lines = []
385
+ debug_crops_dir = "debug_crops"
386
 
387
+ if enable_debug_crops:
388
+ os.makedirs(debug_crops_dir, exist_ok=True)
389
+ add_log(f"Debug crops will be saved to {debug_crops_dir}/")
390
 
391
+ # --- STRATEGY 1: Region Detection ---
392
+ add_log("Strategy 1: Running region detection...")
393
+ region_results = region_model(img, conf=0.2, imgsz=1024, verbose=False)
394
+ regions = region_results[0].boxes
395
+ num_regions = len(regions)
396
+ add_log(f"✓ Found {num_regions} potential text region(s)")
397
 
398
+ found_lines_in_regions = False
399
 
400
+ if num_regions > 0:
401
+ for region_idx, region in enumerate(regions):
402
+ add_log(f"Processing region {region_idx + 1}/{num_regions}")
403
 
404
+ # FIX 1: Use round() instead of int() to minimize precision loss
405
+ rx1, ry1, rx2, ry2 = map(round, region.xyxy[0].tolist())
406
 
407
+ # Calculate region dimensions
408
+ region_width = rx2 - rx1
409
+ region_height = ry2 - ry1
410
 
411
+ add_log(f" Region coords: ({rx1}, {ry1}) → ({rx2}, {ry2}), size: {region_width}x{region_height}")
412
 
413
+ # Filter small artifacts
414
+ if region_width < 50 or region_height < 50:
415
+ add_log(f" Skipping tiny artifact: {region_width}x{region_height} px")
416
+ continue
417
 
418
+ # FIX 2: Add padding to region crops to avoid edge effects
419
+ padding = 10
420
+ padded_rx1 = max(0, rx1 - padding)
421
+ padded_ry1 = max(0, ry1 - padding)
422
+ padded_rx2 = min(width, rx2 + padding)
423
+ padded_ry2 = min(height, ry2 + padding)
424
 
425
+ add_log(f" Padded coords: ({padded_rx1}, {padded_ry1}) → ({padded_rx2}, {padded_ry2})")
426
 
427
+ # Draw GREEN box for Region (original bounds, not padded)
428
+ draw.rectangle([rx1, ry1, rx2, ry2], outline="green", width=5)
429
 
430
+ # Crop Region with padding
431
+ region_crop = img.crop((padded_rx1, padded_ry1, padded_rx2, padded_ry2))
432
 
433
+ if enable_debug_crops:
434
+ region_crop.save(f"{debug_crops_dir}/region_{region_idx:02d}.png")
435
 
436
+ # Detect lines in this region
437
+ add_log(f" Running line detection on region crop ({region_crop.size[0]}x{region_crop.size[1]})...")
438
+ line_results = line_model(region_crop, conf=0.2, imgsz=1024, verbose=False)
439
+ lines_data = line_results[0].boxes.xyxy.cpu().numpy()
440
+ num_lines = len(lines_data)
441
+ add_log(f" ✓ Found {num_lines} line(s) in region")
442
 
443
+ if num_lines > 0:
444
+ found_lines_in_regions = True
445
 
446
+ # Sort lines by Y position (index 1 of xyxy)
447
+ sorted_indices = lines_data[:, 1].argsort()
448
 
449
+ for line_idx, idx in enumerate(sorted_indices):
450
+ # FIX 3: Use round() for line coordinates too
451
+ lx1, ly1, lx2, ly2 = map(round, lines_data[idx].tolist())
452
 
453
+ line_width = lx2 - lx1
454
+ line_height = ly2 - ly1
455
 
456
+ add_log(f" Line {line_idx + 1} (local coords): ({lx1}, {ly1}) → ({lx2}, {ly2}), size: {line_width}x{line_height}")
457
 
458
+ # FIX 4: Translate line coordinates back to original image space
459
+ # Account for padding offset
460
+ global_lx1 = padded_rx1 + lx1
461
+ global_ly1 = padded_ry1 + ly1
462
+ global_lx2 = padded_rx1 + lx2
463
+ global_ly2 = padded_ry1 + ly2
464
 
465
+ # FIX 5: Validate coordinates are within image bounds
466
+ global_lx1 = max(0, min(width, global_lx1))
467
+ global_ly1 = max(0, min(height, global_ly1))
468
+ global_lx2 = max(0, min(width, global_lx2))
469
+ global_ly2 = max(0, min(height, global_ly2))
470
 
471
+ add_log(f" Line {line_idx + 1} (global coords): ({global_lx1}, {global_ly1}) → ({global_lx2}, {global_ly2})")
472
 
473
+ # Draw RED box for Line
474
+ draw.rectangle([global_lx1, global_ly1, global_lx2, global_ly2], outline="red", width=3)
475
 
476
+ # OCR on the line crop from region_crop
477
+ line_crop = region_crop.crop((lx1, ly1, lx2, ly2))
478
 
479
+ if enable_debug_crops:
480
+ line_crop.save(f"{debug_crops_dir}/region_{region_idx:02d}_line_{line_idx:02d}.png")
481
 
482
+ text = run_trocr(line_crop, processor, trocr_model, device)
483
+ add_log(f" Line {line_idx + 1} OCR: '{text}'")
484
+ all_lines.append(text)
485
+
486
+ # --- STRATEGY 2: Fallback to Full Page ---
487
+ if not found_lines_in_regions:
488
+ add_log("⚠️ Region detection yielded no lines. Switching to Fallback Strategy...", "WARNING")
489
+ add_log("Strategy 2: Running line detection on full page")
490
 
491
+ line_results = line_model(img, conf=0.2, imgsz=1024, verbose=False)
492
+ lines_data = line_results[0].boxes.xyxy.cpu().numpy()
493
+ num_lines = len(lines_data)
494
+ add_log(f"✓ Fallback found {num_lines} line(s) on full page")
495
 
496
+ if num_lines > 0:
497
+ sorted_indices = lines_data[:, 1].argsort()
498
 
499
+ for line_idx, idx in enumerate(sorted_indices):
500
+ # FIX 6: Use round() consistently
501
+ lx1, ly1, lx2, ly2 = map(round, lines_data[idx].tolist())
502
 
503
+ line_width = lx2 - lx1
504
+ line_height = ly2 - ly1
505
 
506
+ add_log(f" Fallback Line {line_idx + 1}: ({lx1}, {ly1}) → ({lx2}, {ly2}), size: {line_width}x{line_height}")
507
 
508
+ # FIX 7: Validate coordinates
509
+ lx1 = max(0, min(width, lx1))
510
+ ly1 = max(0, min(height, ly1))
511
+ lx2 = max(0, min(width, lx2))
512
+ ly2 = max(0, min(height, ly2))
513
 
514
+ # Draw RED box for Line (on full image)
515
+ draw.rectangle([lx1, ly1, lx2, ly2], outline="red", width=3)
516
 
517
+ line_crop = img.crop((lx1, ly1, lx2, ly2))
518
 
519
+ if enable_debug_crops:
520
+ line_crop.save(f"{debug_crops_dir}/fullpage_line_{line_idx:02d}.png")
521
 
522
+ text = run_trocr(line_crop, processor, trocr_model, device)
523
+ add_log(f" Fallback Line {line_idx + 1} OCR: '{text}'")
524
+ all_lines.append(text)
525
 
526
+ if not all_lines:
527
+ add_log("Failed to detect any text lines in both strategies", "ERROR")
528
+ return debug_img, "No text could be extracted.", "\n".join(log_output)
529
 
530
+ add_log(f"✓ Success! Extracted {len(all_lines)} total line(s)")
531
 
532
+ if enable_debug_crops:
533
+ add_log(f"✓ Debug crops saved to {debug_crops_dir}/")
534
 
535
+ final_text = '\n'.join(all_lines)
536
 
537
+ return debug_img, final_text, "\n".join(log_output)
538
 
539
+ except Exception as e:
540
+ error_msg = f"Error processing image: {str(e)}"
541
+ add_log(error_msg, "ERROR")
542
+ logger.exception("Full error traceback:")
543
+ return image, f"Error: {str(e)}", "\n".join(log_output)
544
 
545
# Create the Gradio front-end: wires process_document to one image upload
# plus a debug-crops toggle, and exposes three diagnostic outputs
# (annotated image, extracted text, and the processing log trace).
_doc_input = gr.Image(type="pil", label="Upload Handwritten Document")
_debug_toggle = gr.Checkbox(label="Save debug crops to disk", value=False)

_viz_output = gr.Image(type="pil", label="Debug Visualization (Green=Region, Red=Lines)")
_text_output = gr.Textbox(label="Extracted Text", lines=10)
_log_output = gr.Textbox(label="Processing Logs", lines=15)

demo = gr.Interface(
    fn=process_document,
    inputs=[_doc_input, _debug_toggle],
    outputs=[_viz_output, _text_output, _log_output],
    title="📝 Handwritten Text Recognition (HTR) with Enhanced Debugging",
    description="""
    Upload an image of a handwritten document.

    **Visualization Key:**
    - 🟩 **Green Box:** The broad region identified as containing text (original bounds).
    - 🟥 **Red Box:** The specific line of text sent to the OCR engine (with coordinate validation).

    **Improvements:**
    - Fixed coordinate rounding (eliminates truncation errors)
    - Added 10px padding to region crops (reduces edge effects)
    - Coordinate validation (ensures all boxes are within image bounds)
    - Enhanced logging with detailed coordinate tracking
    - Optional debug crop saving
    """,
    flagging_mode="never",
    theme=gr.themes.Soft(),
)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
575
 
576
def _main() -> None:
    """Entry point: log startup and serve the Gradio app."""
    logger.info("Launching Gradio interface...")
    demo.launch()


if __name__ == "__main__":
    _main()
 
579
 
 
 
580
 
 
 
581
 
 
582
 
 
 
 
583
 
 
 
 
 
584
 
 
585
 
 
586
 
 
 
587
 
 
 
 
 
588
 
 
 
 
 
 
589
 
 
 
 
590
 
 
591
 
 
 
 
 
592
 
593
 
594
 
 
596
 
597
 
598
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
599