Spaces:

iammraat
/

test

Sleeping

App Files Files Community

iammraat committed 14 days ago

Commit

f9d61bf

verified ·

1 Parent(s): e82df7f

Update app.py

Browse files

Files changed (1) hide show

app.py +362 -149

app.py CHANGED Viewed

@@ -1,212 +1,425 @@
-import gradio as gr
-from ultralytics import YOLO
-from PIL import Image, ImageDraw, ImageFont
-import torch
-import logging
-import os
-from datetime import datetime
-# # ── Quiet startup ───────────────────────────────────────────────────────
 # os.environ['HF_HUB_DISABLE_PROGRESS_BARS'] = '1'
 # logging.getLogger('ultralytics').setLevel(logging.WARNING)
 # logging.basicConfig(
 #     level=logging.INFO,
-#     format='%(asctime)s | %(level)-5s | %(message)s'
 # )
 # logger = logging.getLogger(__name__)
 os.environ['HF_HUB_DISABLE_PROGRESS_BARS'] = '1'
 logging.getLogger('ultralytics').setLevel(logging.WARNING)
-# FIXED logging format: use levelname, not level
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s | %(levelname)-5s | %(message)s',   # ← changed level → levelname
-    datefmt='%Y-%m-%d %H:%M:%S'
-)
 logger = logging.getLogger(__name__)
-logger.info("Initializing region detector...")
 device = "cuda" if torch.cuda.is_available() else "cpu"
 logger.info(f"Device: {device}")
-# ── Load YOLO ───────────────────────────────────────────────────────────
 try:
-    region_pt = 'regions.pt'
-    if not os.path.exists(region_pt):
         for f in os.listdir('.'):
             name = f.lower()
-            if name.endswith('.pt') and 'region' in name:
-                region_pt = f
                 break
-    if not os.path.exists(region_pt):
-        raise FileNotFoundError("No regions.pt (or similar *.pt) found in current directory")
-    logger.info(f"Loading model: {region_pt}")
-    model = YOLO(region_pt)
-    logger.info("Region detector loaded")
 except Exception as e:
-    logger.error(f"Model loading failed → {e}", exc_info=True)
     raise
-def visualize_regions(
     image,
     conf_thresh: float = 0.25,
-    min_size: int = 60,
-    padding: int = 0,
-    show_labels: bool = True,
-    save_debug_crops: bool = False,
-    imgsz: int = 1024,
 ):
-    start = datetime.now().strftime("%H:%M:%S")
-    logs = [f"[{start}] Processing started"]
     if image is None:
-        logs.append("No image uploaded")
-        return None, "\n".join(logs)
-    # Load & convert
-    if isinstance(image, str):
-        img = Image.open(image).convert("RGB")
-    else:
-        img = image.convert("RGB")
-    w, h = img.size
-    logs.append(f"Image size: {w} × {h}")
-    debug_img = img.copy()
-    draw = ImageDraw.Draw(debug_img)
-    try:
-        # Font for drawing labels (fallback to default)
-        try:
-            font = ImageFont.truetype("arial.ttf", 18)
-        except:
-            font = ImageFont.load_default()
-        # ── Run detection ───────────────────────────────────────────────
-        results = model(
-            img,
-            conf=conf_thresh,
-            imgsz=imgsz,
-            verbose=False
-        )[0]
-        boxes = results.boxes
-        logs.append(f"Detected {len(boxes)} region candidate(s)")
-        kept = 0
-        # Sort top → bottom
-        if len(boxes) > 0:
-            ys = boxes.xyxy[:, 1].cpu().numpy()
-            order = ys.argsort()
-            for idx in order:
-                box = boxes[idx]
-                conf = float(box.conf)
-                if conf < conf_thresh:
-                    continue
-                x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
-                bw, bh = x2 - x1, y2 - y1
-                if bw < min_size or bh < min_size:
-                    continue
-                # Optional padding (mostly for crop saving)
-                px1 = max(0, x1 - padding)
-                py1 = max(0, y1 - padding)
-                px2 = min(w, x2 + padding)
-                py2 = min(h, y2 + padding)
-                # Draw box
-                draw.rectangle((x1, y1, x2, y2), outline="lime", width=3)
-                if show_labels:
-                    label = f"conf {conf:.2f}  {bw}×{bh}"
-                    tw, th = draw.textbbox((0,0), label, font=font)[2:]
-                    draw.rectangle(
-                        (x1, y1 - th - 4, x1 + tw + 8, y1),
-                        fill=(0, 180, 0, 160)
-                    )
-                    draw.text((x1 + 4, y1 - th - 2), label, fill="white", font=font)
-                kept += 1
-                # Optional: save individual crops
-                if save_debug_crops:
-                    os.makedirs("debug_regions", exist_ok=True)
-                    crop = img.crop((px1, py1, px2, py2))
-                    fname = f"debug_regions/r{kept:02d}_conf{conf:.2f}_{bw}x{bh}.png"
-                    crop.save(fname)
-                    logs.append(f"Saved crop → {fname}")
-        if kept == 0:
-            msg = f"No regions kept after filters (conf ≥ {conf_thresh}, size ≥ {min_size}px)"
-            logs.append(msg)
-        else:
-            logs.append(f"Visualized {kept} region(s)")
-        logs.append("Finished.")
-        return debug_img, "\n".join(logs)
-    except Exception as e:
-        logs.append(f"Error during inference: {str(e)}")
-        logger.exception("Inference failed")
-        return debug_img, "\n".join(logs)
-# ── Gradio Interface ────────────────────────────────────────────────────
 demo = gr.Interface(
-    fn=visualize_regions,
     inputs=[
-        gr.Image(type="pil", label="Upload image (handwritten document)"),
-        gr.Slider(0.10, 0.60, step=0.02, value=0.25, label="Confidence threshold"),
-        gr.Slider(30,  300,  step=10,  value=60,   label="Minimum region width/height (px)"),
-        gr.Slider(0,   40,   step=4,   value=0,    label="Padding around box (for crops only)"),
-        gr.Checkbox(label="Draw confidence + size labels on boxes", value=True),
-        gr.Checkbox(label="Save individual region crops to debug_regions/", value=False),
-        gr.Slider(640, 1280, step=64, value=1024, label="Inference image size (imgsz)"),
     ],
     outputs=[
-        gr.Image(label="Detected text regions (green boxes)"),
-        gr.Textbox(label="Log / debug info", lines=14),
     ],
-    title="Region Detector Debug View",
     description=(
-        "Only shows what the region YOLO model sees.\n\n"
-        "• Green boxes = detected text regions\n"
-        "• Tune confidence and min size until boxes look reasonable\n"
-        "• Use logs to see exact confidences and sizes\n"
-        "• Save crops if you want to manually check what is being detected"
     ),
-    # theme=gr.themes.Soft(),          # ← comment out or remove (moved to launch)
-    # allow_flagging="never",          # ← remove this line completely
 )
 if __name__ == "__main__":
-    logger.info("Launching debug interface...")
     demo.launch()

+# import gradio as gr
+# from ultralytics import YOLO
+# from PIL import Image, ImageDraw, ImageFont
+# import torch
+# import logging
+# import os
+# from datetime import datetime
+# # # ── Quiet startup ───────────────────────────────────────────────────────
+# # os.environ['HF_HUB_DISABLE_PROGRESS_BARS'] = '1'
+# # logging.getLogger('ultralytics').setLevel(logging.WARNING)
+# # logging.basicConfig(
+# #     level=logging.INFO,
+# #     format='%(asctime)s | %(level)-5s | %(message)s'
+# # )
+# # logger = logging.getLogger(__name__)
 # os.environ['HF_HUB_DISABLE_PROGRESS_BARS'] = '1'
 # logging.getLogger('ultralytics').setLevel(logging.WARNING)
+# # FIXED logging format: use levelname, not level
 # logging.basicConfig(
 #     level=logging.INFO,
+#     format='%(asctime)s | %(levelname)-5s | %(message)s',   # ← changed level → levelname
+#     datefmt='%Y-%m-%d %H:%M:%S'
 # )
 # logger = logging.getLogger(__name__)
+# logger.info("Initializing region detector...")
+# device = "cuda" if torch.cuda.is_available() else "cpu"
+# logger.info(f"Device: {device}")
+# # ── Load YOLO ───────────────────────────────────────────────────────────
+# try:
+#     region_pt = 'regions.pt'
+#     if not os.path.exists(region_pt):
+#         for f in os.listdir('.'):
+#             name = f.lower()
+#             if name.endswith('.pt') and 'region' in name:
+#                 region_pt = f
+#                 break
+#     if not os.path.exists(region_pt):
+#         raise FileNotFoundError("No regions.pt (or similar *.pt) found in current directory")
+#     logger.info(f"Loading model: {region_pt}")
+#     model = YOLO(region_pt)
+#     logger.info("Region detector loaded")
+# except Exception as e:
+#     logger.error(f"Model loading failed → {e}", exc_info=True)
+#     raise
+# def visualize_regions(
+#     image,
+#     conf_thresh: float = 0.25,
+#     min_size: int = 60,
+#     padding: int = 0,
+#     show_labels: bool = True,
+#     save_debug_crops: bool = False,
+#     imgsz: int = 1024,
+# ):
+#     start = datetime.now().strftime("%H:%M:%S")
+#     logs = [f"[{start}] Processing started"]
+#     if image is None:
+#         logs.append("No image uploaded")
+#         return None, "\n".join(logs)
+#     # Load & convert
+#     if isinstance(image, str):
+#         img = Image.open(image).convert("RGB")
+#     else:
+#         img = image.convert("RGB")
+#     w, h = img.size
+#     logs.append(f"Image size: {w} × {h}")
+#     debug_img = img.copy()
+#     draw = ImageDraw.Draw(debug_img)
+#     try:
+#         # Font for drawing labels (fallback to default)
+#         try:
+#             font = ImageFont.truetype("arial.ttf", 18)
+#         except:
+#             font = ImageFont.load_default()
+#         # ── Run detection ───────────────────────────────────────────────
+#         results = model(
+#             img,
+#             conf=conf_thresh,
+#             imgsz=imgsz,
+#             verbose=False
+#         )[0]
+#         boxes = results.boxes
+#         logs.append(f"Detected {len(boxes)} region candidate(s)")
+#         kept = 0
+#         # Sort top → bottom
+#         if len(boxes) > 0:
+#             ys = boxes.xyxy[:, 1].cpu().numpy()
+#             order = ys.argsort()
+#             for idx in order:
+#                 box = boxes[idx]
+#                 conf = float(box.conf)
+#                 if conf < conf_thresh:
+#                     continue
+#                 x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
+#                 bw, bh = x2 - x1, y2 - y1
+#                 if bw < min_size or bh < min_size:
+#                     continue
+#                 # Optional padding (mostly for crop saving)
+#                 px1 = max(0, x1 - padding)
+#                 py1 = max(0, y1 - padding)
+#                 px2 = min(w, x2 + padding)
+#                 py2 = min(h, y2 + padding)
+#                 # Draw box
+#                 draw.rectangle((x1, y1, x2, y2), outline="lime", width=3)
+#                 if show_labels:
+#                     label = f"conf {conf:.2f}  {bw}×{bh}"
+#                     tw, th = draw.textbbox((0,0), label, font=font)[2:]
+#                     draw.rectangle(
+#                         (x1, y1 - th - 4, x1 + tw + 8, y1),
+#                         fill=(0, 180, 0, 160)
+#                     )
+#                     draw.text((x1 + 4, y1 - th - 2), label, fill="white", font=font)
+#                 kept += 1
+#                 # Optional: save individual crops
+#                 if save_debug_crops:
+#                     os.makedirs("debug_regions", exist_ok=True)
+#                     crop = img.crop((px1, py1, px2, py2))
+#                     fname = f"debug_regions/r{kept:02d}_conf{conf:.2f}_{bw}x{bh}.png"
+#                     crop.save(fname)
+#                     logs.append(f"Saved crop → {fname}")
+#         if kept == 0:
+#             msg = f"No regions kept after filters (conf ≥ {conf_thresh}, size ≥ {min_size}px)"
+#             logs.append(msg)
+#         else:
+#             logs.append(f"Visualized {kept} region(s)")
+#         logs.append("Finished.")
+#         return debug_img, "\n".join(logs)
+#     except Exception as e:
+#         logs.append(f"Error during inference: {str(e)}")
+#         logger.exception("Inference failed")
+#         return debug_img, "\n".join(logs)
+# # ── Gradio Interface ────────────────────────────────────────────────────
+# demo = gr.Interface(
+#     fn=visualize_regions,
+#     inputs=[
+#         gr.Image(type="pil", label="Upload image (handwritten document)"),
+#         gr.Slider(0.10, 0.60, step=0.02, value=0.25, label="Confidence threshold"),
+#         gr.Slider(30,  300,  step=10,  value=60,   label="Minimum region width/height (px)"),
+#         gr.Slider(0,   40,   step=4,   value=0,    label="Padding around box (for crops only)"),
+#         gr.Checkbox(label="Draw confidence + size labels on boxes", value=True),
+#         gr.Checkbox(label="Save individual region crops to debug_regions/", value=False),
+#         gr.Slider(640, 1280, step=64, value=1024, label="Inference image size (imgsz)"),
+#     ],
+#     outputs=[
+#         gr.Image(label="Detected text regions (green boxes)"),
+#         gr.Textbox(label="Log / debug info", lines=14),
+#     ],
+#     title="Region Detector Debug View",
+#     description=(
+#         "Only shows what the region YOLO model sees.\n\n"
+#         "• Green boxes = detected text regions\n"
+#         "• Tune confidence and min size until boxes look reasonable\n"
+#         "• Use logs to see exact confidences and sizes\n"
+#         "• Save crops if you want to manually check what is being detected"
+#     ),
+#     # theme=gr.themes.Soft(),          # ← comment out or remove (moved to launch)
+#     # allow_flagging="never",          # ← remove this line completely
+# )
+# if __name__ == "__main__":
+#     logger.info("Launching debug interface...")
+#     demo.launch()
+import gradio as gr
+from ultralytics import YOLO
+from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+from PIL import Image, ImageDraw
+import torch
+import logging
+import os
+import warnings
+import time
+from datetime import datetime
+# ── Suppress noisy logs ──────────────────────────────────────────────────────
+# Turn off Hugging Face Hub download progress bars.
 os.environ['HF_HUB_DISABLE_PROGRESS_BARS'] = '1'
+# No category is given, so every Python warning is ignored process-wide.
+warnings.filterwarnings('ignore')
+# Third-party loggers: transformers shows errors only, ultralytics warnings and up.
+logging.getLogger('transformers').setLevel(logging.ERROR)
 logging.getLogger('ultralytics').setLevel(logging.WARNING)
+# Clean logging: root logger at INFO, formatted as "timestamp | level | message".
+logging.basicConfig(level=logging.INFO, format='%(asctime)s | %(levelname)-5s | %(message)s')
 logger = logging.getLogger(__name__)
+logger.info("Initializing models...")
+# Use the GPU when CUDA is available, otherwise fall back to the CPU.
 device = "cuda" if torch.cuda.is_available() else "cpu"
 logger.info(f"Device: {device}")
+def load_with_retry(cls, name, token=None, retries=4, delay=6):
+    for attempt in range(1, retries + 1):
+        try:
+            logger.info(f"Loading {name} (attempt {attempt}/{retries})")
+            if "Processor" in str(cls):
+                return cls.from_pretrained(name, token=token)
+            return cls.from_pretrained(name, token=token).to(device)
+        except Exception as e:
+            logger.warning(f"Load failed: {e}")
+            if attempt < retries:
+                time.sleep(delay)
+    raise RuntimeError(f"Failed to load {name} after {retries} attempts")
+# Models are loaded once at import time; any failure is logged and re-raised.
 try:
+    # Locate local YOLO line detection weights
+    line_pt = 'lines.pt'
+    if not os.path.exists(line_pt):
+        # Fallback: first entry in the working directory whose lowercase name
+        # contains 'line' and ends with '.pt' (os.listdir order is arbitrary).
         for f in os.listdir('.'):
             name = f.lower()
+            if 'line' in name and name.endswith('.pt'):
+                line_pt = f
                 break
+    # Still missing after the fallback search: abort with a descriptive error.
+    if not os.path.exists(line_pt):
+        raise FileNotFoundError("Could not find lines.pt (or similar *.pt file containing 'line' in name)")
+    logger.info("Loading YOLO line model...")
+    line_model = YOLO(line_pt)
+    logger.info("YOLO line model loaded")
+    # HF_TOKEN is optional: os.getenv returns None when the variable is unset.
+    hf_token = os.getenv("HF_TOKEN")
+    # Processor and model are both loaded from the same TrOCR checkpoint.
+    processor = load_with_retry(TrOCRProcessor, "microsoft/trocr-base-handwritten", hf_token)
+    trocr     = load_with_retry(VisionEncoderDecoderModel, "microsoft/trocr-base-handwritten", hf_token)
+    logger.info("TrOCR loaded → ready")
 except Exception as e:
+    # Log with the traceback, then re-raise so the failure is not swallowed.
+    logger.error(f"Model loading failed: {e}", exc_info=True)
     raise
+def run_ocr(crop: Image.Image) -> str:
+    if crop.width < 20 or crop.height < 12:
+        return ""
+    pixels = processor(images=crop, return_tensors="pt").pixel_values.to(device)
+    ids = trocr.generate(pixels, max_new_tokens=128)
+    return processor.batch_decode(ids, skip_special_tokens=True)[0].strip()
+def process_document(
     image,
+    enable_debug_crops: bool = False,
+    line_imgsz: int = 768,
     conf_thresh: float = 0.25,
 ):
+    start_ts = datetime.now().strftime("%H:%M:%S")
+    logs = []
+    def log(msg: str, level: str = "INFO"):
+        line = f"[{start_ts}] {level:5} {msg}"
+        logs.append(line)
+        if level == "ERROR":
+            logger.error(msg)
+        else:
+            logger.info(msg)
+    log("Start processing")
     if image is None:
+        log("No image uploaded", "ERROR")
+        return None, "Upload an image", "\n".join(logs)
+    try:
+        # ── Prepare ─────────────────────────────────────────────────────────────
+        if not isinstance(image, Image.Image):
+            img = Image.open(image).convert("RGB")
+        else:
+            img = image.convert("RGB")
+        debug_img = img.copy()
+        draw = ImageDraw.Draw(debug_img)
+        w, h = img.size
+        log(f"Input image: {w} × {h} px")
+        debug_dir = "debug_crops"
+        if enable_debug_crops:
+            os.makedirs(debug_dir, exist_ok=True)
+            log(f"Debug crops will be saved to {debug_dir}/")
+        extracted = []
+        # ── Line detection on full image ────────────────────────────────────────
+        # Adaptive size based on image dimensions
+        max_dim = max(w, h)
+        if max_dim > 2200:
+            used_sz = 1280
+        elif max_dim > 1400:
+            used_sz = 1024
+        elif max_dim < 600:
+            used_sz = 640
+        else:
+            used_sz = line_imgsz
+        log(f"Running line detection (imgsz={used_sz}, conf≥{conf_thresh}) …")
+        res = line_model(img, conf=conf_thresh, imgsz=used_sz, verbose=False)[0]
+        boxes = res.boxes
+        log(f"→ Detected {len(boxes)} line candidate(s)")
+        if len(boxes) == 0:
+            msg = "No text lines detected"
+            log(msg, "WARNING")
+            return debug_img, msg, "\n".join(logs)
+        # Sort top → bottom
+        ys = boxes.xyxy[:, 1].cpu().numpy()   # y_min
+        order = ys.argsort()
+        for j, idx in enumerate(order, 1):
+            conf = float(boxes.conf[idx])
+            x1, y1, x2, y2 = map(round, boxes.xyxy[idx].cpu().tolist())
+            lw, lh = x2 - x1, y2 - y1
+            log(f"  Line {j}/{len(boxes)}  conf={conf:.3f}  {x1},{y1} → {x2},{y2}  ({lw}×{lh})")
+            # Skip very small detections
+            if lw < 60 or lh < 20:
+                log(f"    → skipped (too small)")
+                continue
+            draw.rectangle((x1, y1, x2, y2), outline="red", width=3)
+            line_crop = img.crop((x1, y1, x2, y2))
+            if enable_debug_crops:
+                fname = f"{debug_dir}/line_{j:02d}_conf{conf:.2f}.png"
+                line_crop.save(fname)
+            text = run_ocr(line_crop)
+            log(f"    OCR → '{text}'")
+            if text.strip():
+                extracted.append(text)
+        # ── Finalize ────────────────────────────────────────────────────────────
+        if not extracted:
+            msg = "No readable text found after OCR"
+            log(msg, "WARNING")
+            return debug_img, msg, "\n".join(logs)
+        log(f"Success — extracted {len(extracted)} line(s)")
+        if enable_debug_crops:
+            log(f"Debug crops saved to {debug_dir}/")
+        return debug_img, "\n".join(extracted), "\n".join(logs)
+    except Exception as e:
+        log(f"Processing failed: {e}", "ERROR")
+        logger.exception("Traceback:")
+        return debug_img, f"Error: {str(e)}", "\n".join(logs)
+# ── Gradio interface ─────────────────────────────────────────────────────────
+# Inputs are passed positionally, so their order must match process_document's
+# parameters: image, enable_debug_crops, line_imgsz, conf_thresh.
 demo = gr.Interface(
+    fn=process_document,
     inputs=[
+        gr.Image(type="pil", label="Handwritten document"),
+        gr.Checkbox(label="Save debug crops", value=False),
+        gr.Slider(512, 1280, step=64, value=768, label="Line detection size (imgsz)"),
+        gr.Slider(0.15, 0.5, step=0.05, value=0.25, label="Confidence threshold"),
     ],
+    # Outputs mirror the returned 3-tuple: debug image, extracted text, logs.
     outputs=[
+        gr.Image(label="Debug (red = detected text lines)"),
+        gr.Textbox(label="Extracted Text", lines=10),
+        gr.Textbox(label="Detailed Logs (copy if alignment is wrong)", lines=16),
     ],
+    title="Handwritten Line Detection + TrOCR",
     description=(
+        "Red boxes = text lines detected by YOLO → sent to TrOCR for recognition\n\n"
+        "Use **Detailed Logs** to check coordinates, sizes & confidence values if results look off."
     ),
+    # NOTE(review): the previous revision's comment says theme was "moved to
+    # launch" — confirm the installed Gradio version still accepts theme here.
+    theme=gr.themes.Soft(),
+    flagging_mode="never",
 )
+# Start the web UI only when this file is run as a script.
 if __name__ == "__main__":
+    logger.info("Launching interface…")
     demo.launch()