iammraat committed on
Commit
edc69a6
Β·
verified Β·
1 Parent(s): 651887a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +284 -201
app.py CHANGED
@@ -204,222 +204,305 @@
204
 
205
 
206
 
207
- import gradio as gr
208
- from ultralytics import YOLO
209
- from transformers import TrOCRProcessor, VisionEncoderDecoderModel
210
- from PIL import Image, ImageDraw
211
- import torch
212
- import logging
213
- import os
214
- import warnings
215
- import time
216
- from datetime import datetime
217
-
218
- # ── Suppress noisy logs ──────────────────────────────────────────────────────
219
- os.environ['HF_HUB_DISABLE_PROGRESS_BARS'] = '1'
220
- warnings.filterwarnings('ignore')
221
- logging.getLogger('transformers').setLevel(logging.ERROR)
222
- logging.getLogger('ultralytics').setLevel(logging.WARNING)
223
-
224
- # Clean logging
225
- logging.basicConfig(level=logging.INFO, format='%(asctime)s | %(levelname)-5s | %(message)s')
226
- logger = logging.getLogger(__name__)
227
-
228
- logger.info("Initializing models...")
229
- device = "cuda" if torch.cuda.is_available() else "cpu"
230
- logger.info(f"Device: {device}")
231
-
232
- def load_with_retry(cls, name, token=None, retries=4, delay=6):
233
- for attempt in range(1, retries + 1):
234
- try:
235
- logger.info(f"Loading {name} (attempt {attempt}/{retries})")
236
- if "Processor" in str(cls):
237
- return cls.from_pretrained(name, token=token)
238
- return cls.from_pretrained(name, token=token).to(device)
239
- except Exception as e:
240
- logger.warning(f"Load failed: {e}")
241
- if attempt < retries:
242
- time.sleep(delay)
243
- raise RuntimeError(f"Failed to load {name} after {retries} attempts")
244
-
245
-
246
- try:
247
- # Locate local YOLO line detection weights
248
- line_pt = 'lines.pt'
249
-
250
- if not os.path.exists(line_pt):
251
- for f in os.listdir('.'):
252
- name = f.lower()
253
- if 'line' in name and name.endswith('.pt'):
254
- line_pt = f
255
- break
256
-
257
- if not os.path.exists(line_pt):
258
- raise FileNotFoundError("Could not find lines.pt (or similar *.pt file containing 'line' in name)")
259
-
260
- logger.info("Loading YOLO line model...")
261
- line_model = YOLO(line_pt)
262
- logger.info("YOLO line model loaded")
263
-
264
- hf_token = os.getenv("HF_TOKEN")
265
- processor = load_with_retry(TrOCRProcessor, "microsoft/trocr-base-handwritten", hf_token)
266
- trocr = load_with_retry(VisionEncoderDecoderModel, "microsoft/trocr-base-handwritten", hf_token)
267
- logger.info("TrOCR loaded β†’ ready")
268
-
269
- except Exception as e:
270
- logger.error(f"Model loading failed: {e}", exc_info=True)
271
- raise
272
-
273
-
274
- def run_ocr(crop: Image.Image) -> str:
275
- if crop.width < 20 or crop.height < 12:
276
- return ""
277
- pixels = processor(images=crop, return_tensors="pt").pixel_values.to(device)
278
- ids = trocr.generate(pixels, max_new_tokens=128)
279
- return processor.batch_decode(ids, skip_special_tokens=True)[0].strip()
280
-
281
-
282
- def process_document(
283
- image,
284
- enable_debug_crops: bool = False,
285
- line_imgsz: int = 768,
286
- conf_thresh: float = 0.25,
287
- ):
288
- start_ts = datetime.now().strftime("%H:%M:%S")
289
- logs = []
290
-
291
- def log(msg: str, level: str = "INFO"):
292
- line = f"[{start_ts}] {level:5} {msg}"
293
- logs.append(line)
294
- if level == "ERROR":
295
- logger.error(msg)
296
- else:
297
- logger.info(msg)
298
-
299
- log("Start processing")
300
-
301
- if image is None:
302
- log("No image uploaded", "ERROR")
303
- return None, "Upload an image", "\n".join(logs)
304
-
305
- try:
306
- # ── Prepare ─────────────────────────────────────────────────────────────
307
- if not isinstance(image, Image.Image):
308
- img = Image.open(image).convert("RGB")
309
- else:
310
- img = image.convert("RGB")
311
-
312
- debug_img = img.copy()
313
- draw = ImageDraw.Draw(debug_img)
314
- w, h = img.size
315
- log(f"Input image: {w} Γ— {h} px")
316
-
317
- debug_dir = "debug_crops"
318
- if enable_debug_crops:
319
- os.makedirs(debug_dir, exist_ok=True)
320
- log(f"Debug crops will be saved to {debug_dir}/")
321
-
322
- extracted = []
323
-
324
- # ── Line detection on full image ────────────────────────────────────────
325
- # Adaptive size based on image dimensions
326
- max_dim = max(w, h)
327
- if max_dim > 2200:
328
- used_sz = 1280
329
- elif max_dim > 1400:
330
- used_sz = 1024
331
- elif max_dim < 600:
332
- used_sz = 640
333
- else:
334
- used_sz = line_imgsz
335
-
336
- log(f"Running line detection (imgsz={used_sz}, confβ‰₯{conf_thresh}) …")
337
-
338
- res = line_model(img, conf=conf_thresh, imgsz=used_sz, verbose=False)[0]
339
- boxes = res.boxes
340
-
341
- log(f"β†’ Detected {len(boxes)} line candidate(s)")
342
 
343
- if len(boxes) == 0:
344
- msg = "No text lines detected"
345
- log(msg, "WARNING")
346
- return debug_img, msg, "\n".join(logs)
347
-
348
- # Sort top β†’ bottom
349
- ys = boxes.xyxy[:, 1].cpu().numpy() # y_min
350
- order = ys.argsort()
351
-
352
- for j, idx in enumerate(order, 1):
353
- conf = float(boxes.conf[idx])
354
- x1, y1, x2, y2 = map(round, boxes.xyxy[idx].cpu().tolist())
355
 
356
- lw, lh = x2 - x1, y2 - y1
357
- log(f" Line {j}/{len(boxes)} conf={conf:.3f} {x1},{y1} β†’ {x2},{y2} ({lw}Γ—{lh})")
358
 
359
- # Skip very small detections
360
- if lw < 60 or lh < 20:
361
- log(f" β†’ skipped (too small)")
362
- continue
363
 
364
- draw.rectangle((x1, y1, x2, y2), outline="red", width=3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
365
 
366
- line_crop = img.crop((x1, y1, x2, y2))
 
 
367
 
368
- if enable_debug_crops:
369
- fname = f"{debug_dir}/line_{j:02d}_conf{conf:.2f}.png"
370
- line_crop.save(fname)
 
 
 
 
 
 
 
 
 
371
 
372
- text = run_ocr(line_crop)
373
- log(f" OCR β†’ '{text}'")
374
-
375
- if text.strip():
376
- extracted.append(text)
377
 
378
- # ── Finalize ────────────────────────────────────────────────────────────
379
- if not extracted:
380
- msg = "No readable text found after OCR"
381
- log(msg, "WARNING")
382
- return debug_img, msg, "\n".join(logs)
383
-
384
- log(f"Success β€” extracted {len(extracted)} line(s)")
385
- if enable_debug_crops:
386
- log(f"Debug crops saved to {debug_dir}/")
387
 
388
- return debug_img, "\n".join(extracted), "\n".join(logs)
 
 
 
 
 
389
 
390
- except Exception as e:
391
- log(f"Processing failed: {e}", "ERROR")
392
- logger.exception("Traceback:")
393
- return debug_img, f"Error: {str(e)}", "\n".join(logs)
394
 
 
 
 
395
 
396
- demo = gr.Interface(
397
- fn=process_document,
398
- inputs=[
399
- gr.Image(type="pil", label="Handwritten document"),
400
- gr.Checkbox(label="Save debug crops", value=False),
401
- gr.Slider(512, 1280, step=64, value=768, label="Line detection size (imgsz)"),
402
- gr.Slider(0.15, 0.5, step=0.05, value=0.25, label="Confidence threshold"),
403
- ],
404
- outputs=[
405
- gr.Image(label="Debug (red = detected text lines)"),
406
- gr.Textbox(label="Extracted Text", lines=10),
407
- gr.Textbox(label="Detailed Logs (copy if alignment is wrong)", lines=16),
408
- ],
409
- title="Handwritten Line Detection + TrOCR",
410
- description=(
411
- "Red boxes = text lines detected by YOLO β†’ sent to TrOCR for recognition\n\n"
412
- "Use **Detailed Logs** to check coordinates, sizes & confidence values if results look off."
413
- ),
414
- theme=gr.themes.Soft(),
415
- flagging_mode="never",
416
- )
417
 
418
- if __name__ == "__main__":
419
- logger.info("Launching interface…")
420
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
421
 
 
422
 
 
423
 
 
 
 
424
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
425
 
 
204
 
205
 
206
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
 
 
 
 
 
 
 
 
 
 
 
 
 
208
 
 
 
209
 
 
 
 
 
210
 
211
+ # import gradio as gr
212
+ # from ultralytics import YOLO
213
+ # from transformers import TrOCRProcessor, VisionEncoderDecoderModel
214
+ # from PIL import Image, ImageDraw
215
+ # import torch
216
+ # import logging
217
+ # import os
218
+ # import warnings
219
+ # import time
220
+ # from datetime import datetime
221
+
222
+ # # ── Suppress noisy logs ──────────────────────────────────────────────────────
223
+ # os.environ['HF_HUB_DISABLE_PROGRESS_BARS'] = '1'
224
+ # warnings.filterwarnings('ignore')
225
+ # logging.getLogger('transformers').setLevel(logging.ERROR)
226
+ # logging.getLogger('ultralytics').setLevel(logging.WARNING)
227
+
228
+ # # Clean logging
229
+ # logging.basicConfig(level=logging.INFO, format='%(asctime)s | %(levelname)-5s | %(message)s')
230
+ # logger = logging.getLogger(__name__)
231
 
232
+ # logger.info("Initializing models...")
233
+ # device = "cuda" if torch.cuda.is_available() else "cpu"
234
+ # logger.info(f"Device: {device}")
235
 
236
+ # def load_with_retry(cls, name, token=None, retries=4, delay=6):
237
+ # for attempt in range(1, retries + 1):
238
+ # try:
239
+ # logger.info(f"Loading {name} (attempt {attempt}/{retries})")
240
+ # if "Processor" in str(cls):
241
+ # return cls.from_pretrained(name, token=token)
242
+ # return cls.from_pretrained(name, token=token).to(device)
243
+ # except Exception as e:
244
+ # logger.warning(f"Load failed: {e}")
245
+ # if attempt < retries:
246
+ # time.sleep(delay)
247
+ # raise RuntimeError(f"Failed to load {name} after {retries} attempts")
248
 
 
 
 
 
 
249
 
250
+ # try:
251
+ # # Locate local YOLO line detection weights
252
+ # line_pt = 'lines.pt'
 
 
 
 
 
 
253
 
254
+ # if not os.path.exists(line_pt):
255
+ # for f in os.listdir('.'):
256
+ # name = f.lower()
257
+ # if 'line' in name and name.endswith('.pt'):
258
+ # line_pt = f
259
+ # break
260
 
261
+ # if not os.path.exists(line_pt):
262
+ # raise FileNotFoundError("Could not find lines.pt (or similar *.pt file containing 'line' in name)")
 
 
263
 
264
+ # logger.info("Loading YOLO line model...")
265
+ # line_model = YOLO(line_pt)
266
+ # logger.info("YOLO line model loaded")
267
 
268
+ # hf_token = os.getenv("HF_TOKEN")
269
+ # processor = load_with_retry(TrOCRProcessor, "microsoft/trocr-base-handwritten", hf_token)
270
+ # trocr = load_with_retry(VisionEncoderDecoderModel, "microsoft/trocr-base-handwritten", hf_token)
271
+ # logger.info("TrOCR loaded β†’ ready")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
 
273
+ # except Exception as e:
274
+ # logger.error(f"Model loading failed: {e}", exc_info=True)
275
+ # raise
276
+
277
+
278
+ # def run_ocr(crop: Image.Image) -> str:
279
+ # if crop.width < 20 or crop.height < 12:
280
+ # return ""
281
+ # pixels = processor(images=crop, return_tensors="pt").pixel_values.to(device)
282
+ # ids = trocr.generate(pixels, max_new_tokens=128)
283
+ # return processor.batch_decode(ids, skip_special_tokens=True)[0].strip()
284
+
285
+
286
+ # def process_document(
287
+ # image,
288
+ # enable_debug_crops: bool = False,
289
+ # line_imgsz: int = 768,
290
+ # conf_thresh: float = 0.25,
291
+ # ):
292
+ # start_ts = datetime.now().strftime("%H:%M:%S")
293
+ # logs = []
294
+
295
+ # def log(msg: str, level: str = "INFO"):
296
+ # line = f"[{start_ts}] {level:5} {msg}"
297
+ # logs.append(line)
298
+ # if level == "ERROR":
299
+ # logger.error(msg)
300
+ # else:
301
+ # logger.info(msg)
302
+
303
+ # log("Start processing")
304
+
305
+ # if image is None:
306
+ # log("No image uploaded", "ERROR")
307
+ # return None, "Upload an image", "\n".join(logs)
308
+
309
+ # try:
310
+ # # ── Prepare ─────────────────────────────────────────────────────────────
311
+ # if not isinstance(image, Image.Image):
312
+ # img = Image.open(image).convert("RGB")
313
+ # else:
314
+ # img = image.convert("RGB")
315
+
316
+ # debug_img = img.copy()
317
+ # draw = ImageDraw.Draw(debug_img)
318
+ # w, h = img.size
319
+ # log(f"Input image: {w} Γ— {h} px")
320
+
321
+ # debug_dir = "debug_crops"
322
+ # if enable_debug_crops:
323
+ # os.makedirs(debug_dir, exist_ok=True)
324
+ # log(f"Debug crops will be saved to {debug_dir}/")
325
+
326
+ # extracted = []
327
+
328
+ # # ── Line detection on full image ────────────────────────────────────────
329
+ # # Adaptive size based on image dimensions
330
+ # max_dim = max(w, h)
331
+ # if max_dim > 2200:
332
+ # used_sz = 1280
333
+ # elif max_dim > 1400:
334
+ # used_sz = 1024
335
+ # elif max_dim < 600:
336
+ # used_sz = 640
337
+ # else:
338
+ # used_sz = line_imgsz
339
+
340
+ # log(f"Running line detection (imgsz={used_sz}, confβ‰₯{conf_thresh}) …")
341
+
342
+ # res = line_model(img, conf=conf_thresh, imgsz=used_sz, verbose=False)[0]
343
+ # boxes = res.boxes
344
+
345
+ # log(f"β†’ Detected {len(boxes)} line candidate(s)")
346
+
347
+ # if len(boxes) == 0:
348
+ # msg = "No text lines detected"
349
+ # log(msg, "WARNING")
350
+ # return debug_img, msg, "\n".join(logs)
351
+
352
+ # # Sort top β†’ bottom
353
+ # ys = boxes.xyxy[:, 1].cpu().numpy() # y_min
354
+ # order = ys.argsort()
355
+
356
+ # for j, idx in enumerate(order, 1):
357
+ # conf = float(boxes.conf[idx])
358
+ # x1, y1, x2, y2 = map(round, boxes.xyxy[idx].cpu().tolist())
359
+
360
+ # lw, lh = x2 - x1, y2 - y1
361
+ # log(f" Line {j}/{len(boxes)} conf={conf:.3f} {x1},{y1} β†’ {x2},{y2} ({lw}Γ—{lh})")
362
+
363
+ # # Skip very small detections
364
+ # if lw < 60 or lh < 20:
365
+ # log(f" β†’ skipped (too small)")
366
+ # continue
367
 
368
+ # draw.rectangle((x1, y1, x2, y2), outline="red", width=3)
369
 
370
+ # line_crop = img.crop((x1, y1, x2, y2))
371
 
372
+ # if enable_debug_crops:
373
+ # fname = f"{debug_dir}/line_{j:02d}_conf{conf:.2f}.png"
374
+ # line_crop.save(fname)
375
 
376
+ # text = run_ocr(line_crop)
377
+ # log(f" OCR β†’ '{text}'")
378
+
379
+ # if text.strip():
380
+ # extracted.append(text)
381
+
382
+ # # ── Finalize ────────────────────────────────────────────────────────────
383
+ # if not extracted:
384
+ # msg = "No readable text found after OCR"
385
+ # log(msg, "WARNING")
386
+ # return debug_img, msg, "\n".join(logs)
387
+
388
+ # log(f"Success β€” extracted {len(extracted)} line(s)")
389
+ # if enable_debug_crops:
390
+ # log(f"Debug crops saved to {debug_dir}/")
391
+
392
+ # return debug_img, "\n".join(extracted), "\n".join(logs)
393
+
394
+ # except Exception as e:
395
+ # log(f"Processing failed: {e}", "ERROR")
396
+ # logger.exception("Traceback:")
397
+ # return debug_img, f"Error: {str(e)}", "\n".join(logs)
398
+
399
+
400
+ # demo = gr.Interface(
401
+ # fn=process_document,
402
+ # inputs=[
403
+ # gr.Image(type="pil", label="Handwritten document"),
404
+ # gr.Checkbox(label="Save debug crops", value=False),
405
+ # gr.Slider(512, 1280, step=64, value=768, label="Line detection size (imgsz)"),
406
+ # gr.Slider(0.15, 0.5, step=0.05, value=0.25, label="Confidence threshold"),
407
+ # ],
408
+ # outputs=[
409
+ # gr.Image(label="Debug (red = detected text lines)"),
410
+ # gr.Textbox(label="Extracted Text", lines=10),
411
+ # gr.Textbox(label="Detailed Logs (copy if alignment is wrong)", lines=16),
412
+ # ],
413
+ # title="Handwritten Line Detection + TrOCR",
414
+ # description=(
415
+ # "Red boxes = text lines detected by YOLO β†’ sent to TrOCR for recognition\n\n"
416
+ # "Use **Detailed Logs** to check coordinates, sizes & confidence values if results look off."
417
+ # ),
418
+ # theme=gr.themes.Soft(),
419
+ # flagging_mode="never",
420
+ # )
421
+
422
+ # if __name__ == "__main__":
423
+ # logger.info("Launching interface…")
424
+ # demo.launch()
425
+
426
+
427
+
428
+
429
+
430
+
431
+
432
+
433
+
434
+
435
+
436
+
437
+
438
+
439
+
440
+
441
+
442
+
443
+
444
+
445
+
446
# app.py for a Hugging Face Space.
# Gradio demo for handwritten text recognition (HTR): Riksarkivet YOLO models
# perform region and line detection, Microsoft's TrOCR performs recognition.

import gradio as gr
from ultralytics import YOLO
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import torch

# Hub identifiers for the pretrained models (downloaded on first run,
# served from the local HF cache afterwards).
_REGION_MODEL_ID = "Riksarkivet/yolov9-regions-1"
_LINE_MODEL_ID = "Riksarkivet/yolov9-lines-within-regions-1"
_TROCR_ID = "microsoft/trocr-base-handwritten"

# Detection stage: one model for text regions, one for lines within a region.
region_model = YOLO(_REGION_MODEL_ID)
line_model = YOLO(_LINE_MODEL_ID)

# Recognition stage: TrOCR processor (image preproc+ tokenizer) and model.
trocr_processor = TrOCRProcessor.from_pretrained(_TROCR_ID)
trocr_model = VisionEncoderDecoderModel.from_pretrained(_TROCR_ID)
461
+
462
def process_image(image):
    """Run the two-stage HTR pipeline on a PIL image and return plain text.

    Pipeline:
      1. Detect text regions with the region YOLO model.
      2. Detect text lines inside each region with the line YOLO model.
      3. Recognize each line crop with TrOCR.

    Args:
        image: PIL.Image.Image supplied by the Gradio ``Image(type="pil")`` input.

    Returns:
        str: recognized text — lines within a region are joined with spaces,
        regions are separated by a blank line. A placeholder message is
        returned when nothing is detected.
    """
    # Step 1: Detect text regions.
    region_results = region_model(image)

    if not region_results or not region_results[0].boxes:
        return "No regions detected."

    texts = []

    # FIX: sort regions top-to-bottom so the output follows reading order.
    # YOLO returns boxes in detection/confidence order, not spatial order —
    # lines were already sorted by y, but regions were not, scrambling
    # multi-region pages.
    regions = sorted(region_results[0].boxes, key=lambda b: float(b.xyxy[0][1]))

    for region in regions:
        # Bounding box in full-image coordinates (x1, y1, x2, y2).
        x1, y1, x2, y2 = map(int, region.xyxy[0])
        region_crop = image.crop((x1, y1, x2, y2))

        # Step 2: Detect lines within the region crop.
        line_results = line_model(region_crop)

        if not line_results or not line_results[0].boxes:
            texts.append("No lines detected in this region.")
            continue

        region_texts = []
        # Sort lines by y-coordinate (top to bottom) within the region.
        for line in sorted(line_results[0].boxes, key=lambda b: float(b.xyxy[0][1])):
            # Line box is in region-crop coordinates.
            lx1, ly1, lx2, ly2 = map(int, line.xyxy[0])
            line_crop = region_crop.crop((lx1, ly1, lx2, ly2))

            # Step 3: Recognize text with TrOCR. Inference only, so skip
            # autograd bookkeeping (saves memory and time).
            pixel_values = trocr_processor(images=line_crop, return_tensors="pt").pixel_values
            with torch.no_grad():
                generated_ids = trocr_model.generate(pixel_values)
            text = trocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
            region_texts.append(text)

        texts.append(" ".join(region_texts))  # Join lines in region with a space.

    return "\n\n".join(texts)  # Separate regions with a double newline.
496
+
497
# Gradio interface: a single handwritten-document image in, plain text out.
_image_input = gr.Image(type="pil")

demo = gr.Interface(
    fn=process_image,
    inputs=_image_input,
    outputs="text",
    title="HTR Demo with YOLO Detection and TrOCR Recognition",
    description="Upload an image of a handwritten document. The app will detect regions, then lines, and recognize text using Microsoft's TrOCR.",
)

if __name__ == "__main__":
    # Start the Gradio server when executed as a script.
    demo.launch()