Spaces:

AzizMiladi
/

FiberGate

Sleeping

File size: 25,936 Bytes

"""
ocr_rasterise.py
────────────────
OCR + rasterisation pipeline for GuichetOI_ML dataset.

Directory layout expected:
    DataRef/
        Autorisation/
        Certificat/
        fiche/
        Mandat/
        PlanMasse/
        PlanSituation/

Output layout produced:
    processed_dataref/
        Autorisation/
            images/      ← PNG page images  (200 DPI)
            ocr/         ← per-page JSON    (tokens + bboxes + full text)
        Certificat/  ...
        fiche/       ...
        Mandat/      ...
        PlanMasse/   ...
        PlanSituation/ ...

    label_studio_tasks.json  ← ready-to-import Label Studio task list

Usage:
    python ocr_rasterise.py                    # uses default paths below
    python ocr_rasterise.py --dataset_dir ./DataRef --output_dir ./processed_dataref
"""

import argparse
import json
import logging
import re
import sys
import unicodedata
from pathlib import Path
from typing import Optional

# ── Third-party ──────────────────────────────────────────────────────────────
try:
    from pdf2image import convert_from_path
    from pdf2image.exceptions import PDFPageCountError
except ImportError:
    sys.exit("pip install pdf2image")

try:
    import pytesseract
    from pytesseract import Output
except ImportError:
    sys.exit("pip install pytesseract")

try:
    from PIL import Image
except ImportError:
    sys.exit("pip install Pillow")

try:
    import cv2
    import numpy as np
except ImportError:
    sys.exit("pip install opencv-python numpy")

# ── Logging ──────────────────────────────────────────────────────────────────
# Configure the root logger at import time: INFO level with an HH:MM:SS
# timestamp. At this level the log.debug() messages emitted by the deskew
# step are not shown.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s  %(levelname)-8s  %(message)s",
    datefmt="%H:%M:%S",
)
# Module-level logger shared by every function in this file.
log = logging.getLogger(__name__)

# ─────────────────────────────────────────────────────────────────────────────
# CONFIGURATION
# ─────────────────────────────────────────────────────────────────────────────

# Organized layout: sub-directory name inside the dataset root → document
# class. The class name is also used as the output sub-directory name.
DATASET_FOLDERS: dict[str, str] = {
    "Autorisation":  "Autorisation",
    "Certificat":    "Certificat",
    "fiche":         "fiche",
    "Mandat":        "Mandat",
    "PlanMasse":     "PlanMasse",
    "PlanSituation": "PlanSituation",
}

# Filename patterns used to classify files in flat directory structures
# (e.g. DataSet2): document class → regular expression.
# Order matters: the dict is scanned in insertion order and the first pattern
# that matches wins, so more specific patterns must come first to avoid
# overlapping matches.
LABEL_PATTERNS: dict[str, str] = {
    "Mandat": r"\bmandat\b",
    "Certificat": r"(certificat[- ]?d[- ]?adressage|certificat[- ]?adr|adr(?:essage)?)",
    "PlanMasse": r"plan[- ]?(?:de[- ])?masse",
    "PlanSituation": r"plan[- ]?(?:de[- ])?situation|situation",
    "fiche": r"fiche[- ]?(?:de[- ])?renseignement|renseignement",
    "Autorisation": r"(auto[- ]?urbanisme|arrete[- ]?pc|autorisation)",
}

# Defaults; --lang, --dpi and --min_conf rebind the first, second and fourth
# of these when the file is run as a script.
OCR_LANG    = "fra"   # language code passed to Tesseract as `lang`
RASTER_DPI  = 200     # resolution used when rasterising PDF pages
BBOX_NORM   = 1000    # upper bound of the normalised bbox coordinate range
MIN_CONF    = 30      # OCR tokens with a lower confidence are dropped
# Lower-cased file suffixes accepted as input documents.
SUPPORTED_EXT = {".pdf", ".png", ".jpg", ".jpeg", ".tif", ".tiff"}


# ─────────────────────────────────────────────────────────────────────────────
# IMAGE PRE-PROCESSING
# ─────────────────────────────────────────────────────────────────────────────

def preprocess_image(pil_img: Image.Image) -> Image.Image:
    """
    Turn a PIL page image into a cleaned single-channel image for Tesseract.

    Pipeline
    ────────
    1. Convert to greyscale
    2. Upscale so the longest side is about 2000 px when it is smaller
    3. Deskew (delegated to `_deskew`)
    4. Adaptive Gaussian binarisation
    5. Morphological opening with a 2×2 kernel to drop isolated pixels
    6. Unsharp-mask sharpening

    Returns a new PIL image built from the sharpened array; the input image
    is not modified.
    """
    # 1. Greyscale
    grey = pil_img.convert("L")

    # 2. Upscale small pages (aspect ratio preserved, sizes truncated to int)
    width, height = grey.size
    longest = max(width, height)
    if longest < 2000:
        factor = 2000 / longest
        target = (int(width * factor), int(height * factor))
        grey = grey.resize(target, Image.LANCZOS)

    # 3. Deskew on the raw pixel array
    pixels = _deskew(np.array(grey, dtype=np.uint8))

    # 4. Adaptive binarisation
    thresholded = cv2.adaptiveThreshold(
        pixels, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        blockSize=51,
        C=10,
    )

    # 5. Opening removes isolated noise pixels
    cleaned = cv2.morphologyEx(
        thresholded,
        cv2.MORPH_OPEN,
        cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2)),
    )

    # 6. Unsharp mask: 1.8 * image - 0.8 * blurred copy
    soft = cv2.GaussianBlur(cleaned, (0, 0), sigmaX=1.5)
    return Image.fromarray(cv2.addWeighted(cleaned, 1.8, soft, -0.8, 0))


def _deskew(arr: np.ndarray) -> np.ndarray:
    """
    Estimate the page skew from Hough lines and rotate the image to undo it.

    Only lines whose Hough theta lies within 10° of 90° vote; the median of
    their offsets is the skew estimate. The input array is returned unchanged
    when fewer than 5 lines are found, when no line qualifies, when the
    estimate is below 0.3°, or when any step raises (best effort: the failure
    is only logged at debug level).
    """
    try:
        edge_map = cv2.Canny(arr, 50, 150, apertureSize=3)
        lines = cv2.HoughLines(edge_map, 1, np.pi / 180, threshold=200)
        if lines is None or len(lines) < 5:
            return arr

        # Offset of each line's theta from 90°, keeping only small offsets.
        offsets = (np.degrees(theta) - 90 for _rho, theta in lines[:, 0])
        votes = [offset for offset in offsets if abs(offset) < 10]
        if not votes:
            return arr

        skew = float(np.median(votes))
        if abs(skew) < 0.3:
            # Too small to be worth resampling the image.
            return arr

        height, width = arr.shape
        rotation = cv2.getRotationMatrix2D((width / 2, height / 2), skew, 1.0)
        straightened = cv2.warpAffine(
            arr, rotation, (width, height),
            flags=cv2.INTER_LINEAR,
            borderMode=cv2.BORDER_REPLICATE,
        )
        log.debug("Deskewed %.2f°", skew)
        return straightened
    except Exception as exc:
        log.debug("Deskew skipped: %s", exc)
        return arr


# ─────────────────────────────────────────────────────────────────────────────
# PDF → IMAGES
# ─────────────────────────────────────────────────────────────────────────────

def pdf_to_images(pdf_path: Path, dpi: int = RASTER_DPI) -> list[Image.Image]:
    """
    Rasterise every page of `pdf_path` at `dpi` and return RGB PIL images.

    Never raises: a PDFPageCountError is logged as a warning ("Empty PDF"),
    any other failure is logged as an error, and in both cases an empty list
    is returned.
    """
    try:
        rendered = convert_from_path(
            str(pdf_path), dpi=dpi, fmt="png", thread_count=2
        )
        log.info("  Rasterised %d page(s) from %s", len(rendered), pdf_path.name)
        rgb_pages = []
        for page in rendered:
            rgb_pages.append(page.convert("RGB"))
        return rgb_pages
    except Exception as exc:
        if isinstance(exc, PDFPageCountError):
            log.warning("  Empty PDF: %s", pdf_path.name)
        else:
            log.error("  pdf_to_images failed for %s: %s", pdf_path.name, exc)
        return []


# ─────────────────────────────────────────────────────────────────────────────
# OCR
# ─────────────────────────────────────────────────────────────────────────────

def run_ocr(pil_img: Image.Image, lang: str = OCR_LANG) -> dict:
    """
    Run Tesseract on a PIL image and return a structured result dict:

        words        – list of token strings
        bboxes       – pixel [x0, y0, x1, y1] per token
        bboxes_norm  – bboxes normalised to [0, BBOX_NORM] for LayoutLMv3
        confs        – Tesseract confidence per token (integer)
        full_text    – raw OCR string (whole page), stripped
        width/height – image dimensions in pixels

    Tokens that are empty after stripping, whose confidence is below
    MIN_CONF, or whose clamped box has no area are dropped, so the four
    per-token lists always have the same length.
    """
    config = "--oem 1 --psm 6"
    w, h   = pil_img.size

    data = pytesseract.image_to_data(
        pil_img, lang=lang, config=config, output_type=Output.DICT
    )

    words, bboxes, bboxes_norm, confs = [], [], [], []

    rows = zip(
        data["text"], data["conf"],
        data["left"], data["top"], data["width"], data["height"],
    )
    for raw_text, raw_conf, left, top, box_w, box_h in rows:
        word = raw_text.strip()
        if not word:
            continue

        # Depending on the pytesseract/Tesseract versions the confidence may
        # arrive as an int, a float or a string such as "96.06"; a plain
        # int() raises ValueError on the latter. Unparseable values are
        # treated as -1 (no confidence).
        try:
            conf = int(float(raw_conf))
        except (TypeError, ValueError):
            conf = -1

        if conf < MIN_CONF:
            continue

        # Clamp the box to the image bounds.
        x0 = max(0, left)
        y0 = max(0, top)
        x1 = min(w, x0 + box_w)
        y1 = min(h, y0 + box_h)

        if x1 <= x0 or y1 <= y0:
            continue

        words.append(word)
        bboxes.append([x0, y0, x1, y1])
        bboxes_norm.append([
            int(x0 / w * BBOX_NORM),
            int(y0 / h * BBOX_NORM),
            int(x1 / w * BBOX_NORM),
            int(y1 / h * BBOX_NORM),
        ])
        confs.append(conf)

    # Separate Tesseract call that returns the page text as one string.
    full_text = pytesseract.image_to_string(pil_img, lang=lang, config=config)

    return {
        "words":       words,
        "bboxes":      bboxes,
        "bboxes_norm": bboxes_norm,
        "confs":       confs,
        "full_text":   full_text.strip(),
        "width":       w,
        "height":      h,
    }


# ─────────────────────────────────────────────────────────────────────────────
# LABEL STUDIO TASK BUILDER  (fixed)
# ─────────────────────────────────────────────────────────────────────────────

def build_label_studio_task(
    image_path: Path,
    ocr_result: dict,
    doc_class: str,
    relative_image_url: Optional[str] = None,
) -> dict:
    """
    Build one Label Studio task compatible with the official OCR template.

    Parameters
    ──────────
    image_path          page image the task refers to
    ocr_result          dict providing "words", "bboxes" (pixel
                        [x0, y0, x1, y1]), "full_text", "width", "height"
    doc_class           document class stored as task metadata
    relative_image_url  URL to store in data["image"]; when omitted (or
                        empty) a file:// URI of the resolved `image_path`
                        is used instead

    Returns a dict with "data", an empty "annotations" list and a single
    entry in "predictions".

    Label Studio's OCR template requires task["data"] to contain two keys:
        "image"  →  URL/path of the page PNG to display
        "ocr"    →  the raw OCR text string (bound to the Text area widget)

    Other keys inside "data" are allowed as metadata, but without those two
    LS throws:
        'ValidationError: "ocr" key is expected in task data'

    Pre-annotations (rectangle + transcription + empty label slot per OCR
    token) are stored in "predictions" so annotators see boxes already drawn
    and only need to click a label.
    """
    if relative_image_url:
        url = relative_image_url
    else:
        # as_uri() yields a well-formed file URI on every platform; gluing
        # "file:///" onto a POSIX path produced four slashes.
        url = image_path.resolve().as_uri()

    w, h = ocr_result["width"], ocr_result["height"]

    results = []
    for idx, (word, (x0, y0, x1, y1)) in enumerate(
        zip(ocr_result["words"], ocr_result["bboxes"])
    ):
        # Convert pixel bbox → Label Studio percentage format
        # LS uses: x, y = top-left corner (%); width, height = size (%)
        geometry = {
            "x":        round(x0 / w * 100, 4),
            "y":        round(y0 / h * 100, 4),
            "width":    round((x1 - x0) / w * 100, 4),
            "height":   round((y1 - y0) / h * 100, 4),
            "rotation": 0,
        }

        region_id = f"r{idx}"
        # NOTE(review): the three results are tied together via "parent_id";
        # confirm against the Label Studio pre-annotation docs whether they
        # should share one "id" instead.

        # ── 1. Rectangle bounding box ─────────────────────────────────────────
        results.append({
            "id":        region_id,
            "from_name": "bbox",
            "to_name":   "image",
            "type":      "rectangle",
            "value":     dict(geometry),
        })

        # ── 2. Transcription text (shows the OCR word inside the box) ─────────
        results.append({
            "id":        f"t{idx}",
            "from_name": "transcription",
            "to_name":   "image",
            "type":      "textarea",
            "parent_id": region_id,
            "value":     {**geometry, "text": [word]},
        })

        # ── 3. Empty label slot — annotator picks the entity label ────────────
        results.append({
            "id":        f"l{idx}",
            "from_name": "label",
            "to_name":   "image",
            "type":      "rectanglelabels",
            "parent_id": region_id,
            "value":     {**geometry, "rectanglelabels": []},
        })

    return {
        "data": {
            # ── REQUIRED by Label Studio OCR template ─────────────────────────
            "image": url,
            "ocr":   ocr_result["full_text"],
            # ── Extra metadata (useful downstream) ────────────────────────────
            "doc_class":  doc_class,
            "image_file": image_path.name,
        },
        "annotations": [],
        "predictions": [{"result": results, "score": 0.0}],
    }


# ─────────────────────────────────────────────────────────────────────────────
# MAIN PIPELINE
# ─────────────────────────────────────────────────────────────────────────────

def process_document(
    src_path: Path,
    img_dir: Path,
    ocr_dir: Path,
    doc_class: str,
    ls_tasks: list,
    stem: str,
) -> int:
    """
    Process one source file (PDF or image). Returns pages processed.

    For every page the function writes, using `stem` as filename prefix:
        img_dir/<stem>_pNNN_raw.png   rasterised page as loaded
        img_dir/<stem>_pNNN.png       pre-processed page (OCR input)
        ocr_dir/<stem>_pNNN.json      OCR result plus source metadata
    and appends one Label Studio task to `ls_tasks` (mutated in place).

    Returns 0 when the file type is unsupported or the file cannot be
    opened or rasterised.
    """
    ext = src_path.suffix.lower()

    if ext == ".pdf":
        pages = pdf_to_images(src_path, dpi=RASTER_DPI)
    elif ext in SUPPORTED_EXT:
        try:
            # The context manager closes the underlying file once the pixel
            # data has been copied by convert(); otherwise the handle stays
            # open until garbage collection.
            # NOTE(review): only a single image is read here, so extra
            # frames of a multi-page TIFF are presumably ignored — confirm.
            with Image.open(src_path) as opened:
                pages = [opened.convert("RGB")]
        except Exception as exc:
            log.error("  Cannot open %s: %s", src_path.name, exc)
            return 0
    else:
        log.warning("  Unsupported type: %s", src_path.name)
        return 0

    processed = 0
    for page_idx, page_rgb in enumerate(pages):
        page_stem = f"{stem}_p{page_idx:03d}"

        # Save raw rasterised PNG (original colours, useful for inspection)
        raw_path = img_dir / f"{page_stem}_raw.png"
        page_rgb.save(raw_path, "PNG")

        # Pre-process then save the clean version (used for OCR + LS display)
        page_proc  = preprocess_image(page_rgb)
        proc_path  = img_dir / f"{page_stem}.png"
        page_proc.save(proc_path, "PNG")

        # Run OCR
        ocr = run_ocr(page_proc, lang=OCR_LANG)
        log.info(
            "    Page %d → %d tokens | %d chars",
            page_idx, len(ocr["words"]), len(ocr["full_text"]),
        )

        # Save per-page OCR JSON (used later during dataset preparation)
        ocr_payload = {
            "source_file": src_path.name,
            "doc_class":   doc_class,
            "page_index":  page_idx,
            "image_file":  proc_path.name,
            **ocr,
        }
        (ocr_dir / f"{page_stem}.json").write_text(
            json.dumps(ocr_payload, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )

        # Build & collect Label Studio task
        ls_tasks.append(build_label_studio_task(
            image_path=proc_path,
            ocr_result=ocr,
            doc_class=doc_class,
        ))

        processed += 1

    return processed


def run_pipeline(dataset_dir: Path, output_dir: Path) -> None:
    """
    Process every supported document under `dataset_dir`.

    Two input layouts are supported:
    1. Organized: one sub-directory per class, named after the keys of
       DATASET_FOLDERS (Autorisation/, Certificat/, ...).
    2. Flat: all files directly in `dataset_dir`, classified from their
       filename by `_classify_file`; files matching no pattern are reported
       in a warning and skipped.

    The organized layout is chosen as soon as at least one class
    sub-directory exists.

    Outputs, all below `output_dir` (created if missing):
        <class>/images/ and <class>/ocr/   per-page files
        label_studio_tasks.json            list of all Label Studio tasks
    A per-class summary table is printed to stdout at the end.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    ls_tasks: list[dict] = []
    summary:  dict[str, dict] = {}

    # is_dir() rather than exists(): a stray *file* named like a class folder
    # must neither select the organized layout nor be iterated as a directory.
    is_organized = any(
        (dataset_dir / folder_name).is_dir()
        for folder_name in DATASET_FOLDERS
    )

    if is_organized:
        # ── Organized structure: one sub-directory per class ──────────────────
        for folder_name, doc_class in DATASET_FOLDERS.items():
            folder_path = dataset_dir / folder_name
            if not folder_path.is_dir():
                log.warning("Folder not found, skipping: %s", folder_path)
                continue

            img_dir = output_dir / doc_class / "images"
            ocr_dir = output_dir / doc_class / "ocr"
            img_dir.mkdir(parents=True, exist_ok=True)
            ocr_dir.mkdir(parents=True, exist_ok=True)

            log.info("━━━ %s (%s) ━━━", doc_class, folder_name)

            # Same filter as the flat branch: regular files only.
            files = sorted(
                f for f in folder_path.iterdir()
                if f.is_file() and f.suffix.lower() in SUPPORTED_EXT
            )

            if not files:
                log.warning("  No supported files in %s", folder_path)
                continue

            total_pages = 0
            for src_file in files:
                log.info("  Processing: %s", src_file.name)
                n = process_document(
                    src_path=src_file,
                    img_dir=img_dir,
                    ocr_dir=ocr_dir,
                    doc_class=doc_class,
                    ls_tasks=ls_tasks,
                    stem=_safe_stem(src_file.stem),
                )
                total_pages += n

            summary[doc_class] = {"files": len(files), "pages": total_pages}
            log.info("  → %d file(s), %d page(s)", len(files), total_pages)

    else:
        # ── Flat structure: Files at root, classified by pattern ──────────────
        log.info("━━━ Flat dataset structure (pattern-based classification) ━━━")

        files = sorted(
            f for f in dataset_dir.iterdir()
            if f.is_file() and f.suffix.lower() in SUPPORTED_EXT
        )

        if not files:
            log.warning("  No supported files in %s", dataset_dir)
        else:
            # Group files by classification; "_unclassified" is added last so
            # its warning is logged after all real classes were processed.
            classified: dict[str, list[Path]] = {
                doc_class: [] for doc_class in LABEL_PATTERNS
            }
            classified["_unclassified"] = []

            for src_file in files:
                doc_class = _classify_file(src_file.name)
                if doc_class:
                    classified[doc_class].append(src_file)
                else:
                    classified["_unclassified"].append(src_file)

            # Process each class
            for doc_class, class_files in classified.items():
                if not class_files:
                    continue

                # Unclassified files are only reported (first three names).
                if doc_class == "_unclassified":
                    log.warning("  Unclassified (%d files): %s",
                                len(class_files),
                                ", ".join(f.name for f in class_files[:3]))
                    continue

                img_dir = output_dir / doc_class / "images"
                ocr_dir = output_dir / doc_class / "ocr"
                img_dir.mkdir(parents=True, exist_ok=True)
                ocr_dir.mkdir(parents=True, exist_ok=True)

                log.info("  %s (%d files)", doc_class, len(class_files))

                total_pages = 0
                for src_file in class_files:
                    log.info("    Processing: %s", src_file.name)
                    n = process_document(
                        src_path=src_file,
                        img_dir=img_dir,
                        ocr_dir=ocr_dir,
                        doc_class=doc_class,
                        ls_tasks=ls_tasks,
                        stem=_safe_stem(src_file.stem),
                    )
                    total_pages += n

                summary[doc_class] = {"files": len(class_files), "pages": total_pages}
                log.info("    → %d page(s)", total_pages)

    # Write Label Studio import file
    ls_path = output_dir / "label_studio_tasks.json"
    ls_path.write_text(
        json.dumps(ls_tasks, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    log.info("Label Studio tasks → %s  (%d tasks)", ls_path, len(ls_tasks))

    # Print summary table
    print("\n" + "═" * 50)
    print(f"  {'Class':<22} {'Files':>6}  {'Pages':>6}")
    print("─" * 50)
    total_f = total_p = 0
    for cls, s in summary.items():
        print(f"  {cls:<22} {s['files']:>6}  {s['pages']:>6}")
        total_f += s["files"]
        total_p += s["pages"]
    print("─" * 50)
    print(f"  {'TOTAL':<22} {total_f:>6}  {total_p:>6}")
    print("═" * 50 + "\n")


# ─────────────────────────────────────────────────────────────────────────────
# HELPERS
# ─────────────────────────────────────────────────────────────────────────────

def _safe_stem(name: str) -> str:
    """Normalise a filename stem to ASCII-safe, space-free form."""
    nfkd = unicodedata.normalize("NFKD", name)
    ascii_str = nfkd.encode("ascii", "ignore").decode("ascii")
    return re.sub(r"[^\w\-]", "_", ascii_str)


def _classify_file(filename: str) -> Optional[str]:
    """Classify a file by filename pattern matching. Returns doc_class or None."""
    filename_lower = filename.lower()
    for doc_class, pattern in LABEL_PATTERNS.items():
        if re.search(pattern, filename_lower):
            return doc_class
    return None


def validate_classification(dataset_dir: Path) -> None:
    """
    Print how the files directly inside `dataset_dir` would be classified,
    without processing any of them.

    Only regular files with a supported extension are considered. For each
    class (LABEL_PATTERNS order, then UNCLASSIFIED) that received at least
    one file, up to ten file names are listed followed by a count of the
    rest. A warning is logged and nothing is printed when no supported file
    is found.
    """
    files = sorted(
        f for f in dataset_dir.iterdir()
        if f.is_file() and f.suffix.lower() in SUPPORTED_EXT
    )

    if not files:
        log.warning("No supported files in %s", dataset_dir)
        return

    classified: dict[str, list[str]] = {doc_class: [] for doc_class in LABEL_PATTERNS}
    classified["_unclassified"] = []

    for src_file in files:
        doc_class = _classify_file(src_file.name)
        if doc_class:
            classified[doc_class].append(src_file.name)
        else:
            classified["_unclassified"].append(src_file.name)

    # Print results
    print("\n" + "═" * 70)
    print(f"  CLASSIFICATION VALIDATION  ({len(files)} files)")
    print("═" * 70)

    for doc_class in [*LABEL_PATTERNS, "_unclassified"]:
        files_in_class = classified[doc_class]
        if not files_in_class:
            continue

        display_class = "UNCLASSIFIED" if doc_class == "_unclassified" else doc_class
        print(f"\n  {display_class}  ({len(files_in_class)} files)")
        print("  " + "─" * 66)
        for fname in files_in_class[:10]:  # Show first 10
            print(f"    • {fname}")
        if len(files_in_class) > 10:
            print(f"    ... and {len(files_in_class) - 10} more")

    print("\n" + "═" * 70 + "\n")


# ─────────────────────────────────────────────────────────────────────────────
# CLI
# ─────────────────────────────────────────────────────────────────────────────

def _parse_args() -> argparse.Namespace:
    """
    Parse the command-line options of the script.

    Options: --dataset_dir, --output_dir (paths), --dpi, --lang, --min_conf
    (defaulting to the module constants) and the --validate flag.
    """
    parser = argparse.ArgumentParser(description="Rasterise + OCR for GuichetOI_ML")

    # (flag, type, default) for every value-carrying option, in CLI order.
    valued_options = (
        ("--dataset_dir", Path, Path("DataRef")),
        ("--output_dir",  Path, Path("processed_dataref")),
        ("--dpi",         int,  RASTER_DPI),
        ("--lang",        str,  OCR_LANG),
        ("--min_conf",    int,  MIN_CONF),
    )
    for flag, kind, fallback in valued_options:
        parser.add_argument(flag, type=kind, default=fallback)

    parser.add_argument(
        "--validate",
        action="store_true",
        help="Only validate classification, don't process files",
    )
    return parser.parse_args()


if __name__ == "__main__":
    args = _parse_args()

    # This runs at module level, so the assignment rebinds the module
    # constants; code that reads them at call time sees the CLI values.
    RASTER_DPI, OCR_LANG, MIN_CONF = args.dpi, args.lang, args.min_conf

    log.info("Dataset : %s", args.dataset_dir.resolve())
    log.info("Output  : %s", args.output_dir.resolve())
    log.info("DPI=%d  lang=%s  min_conf=%d", RASTER_DPI, OCR_LANG, MIN_CONF)

    if not args.validate:
        # Default mode: full rasterise + OCR run.
        run_pipeline(dataset_dir=args.dataset_dir, output_dir=args.output_dir)
    else:
        # Dry run: only show how files would be classified.
        log.info("Running classification validation (no files will be processed)")
        validate_classification(dataset_dir=args.dataset_dir)