File size: 25,936 Bytes
f4c0357
 
 
 
 
 
33ddb61
 
 
 
 
 
 
f4c0357
 
33ddb61
f4c0357
 
 
 
 
 
 
 
 
 
 
 
 
33ddb61
f4c0357
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33ddb61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4c0357
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33ddb61
 
 
 
 
 
f4c0357
 
 
 
33ddb61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4c0357
33ddb61
f4c0357
33ddb61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4c0357
33ddb61
 
 
 
f4c0357
33ddb61
 
f4c0357
 
 
33ddb61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4c0357
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33ddb61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4c0357
 
 
 
 
 
33ddb61
 
f4c0357
 
 
33ddb61
f4c0357
 
 
 
 
 
 
 
 
 
 
 
 
33ddb61
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
"""
ocr_rasterise.py
────────────────
OCR + rasterisation pipeline for GuichetOI_ML dataset.

Directory layout expected:
    DataRef/
        Autorisation/
        Certificat/
        fiche/
        Mandat/
        PlanMasse/
        PlanSituation/

Output layout produced:
    processed_dataref/
        Autorisation/
            images/      ← PNG page images  (200 DPI)
            ocr/         ← per-page JSON    (tokens + bboxes + full text)
        Certificat/  ...
        fiche/       ...
        Mandat/      ...
        PlanMasse/   ...
        PlanSituation/ ...

    label_studio_tasks.json  ← ready-to-import Label Studio task list

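Usage:
    python ocr_rasterise.py                    # uses the default paths (DataRef → processed_dataref)
    python ocr_rasterise.py --dataset_dir ./DataRef --output_dir ./processed_dataref
    python ocr_rasterise.py --validate         # only report filename-based classification, no processing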
"""

import argparse
import json
import logging
import re
import sys
import unicodedata
from pathlib import Path
from typing import Optional

# ── Third-party ──────────────────────────────────────────────────────────────
try:
    from pdf2image import convert_from_path
    from pdf2image.exceptions import PDFPageCountError
except ImportError:
    sys.exit("pip install pdf2image")

try:
    import pytesseract
    from pytesseract import Output
except ImportError:
    sys.exit("pip install pytesseract")

try:
    from PIL import Image
except ImportError:
    sys.exit("pip install Pillow")

try:
    import cv2
    import numpy as np
except ImportError:
    sys.exit("pip install opencv-python numpy")

# ── Logging ──────────────────────────────────────────────────────────────────
# Module-level logger; the root logger is configured once at import time
# (INFO level, time-of-day timestamps, fixed-width level names).
log = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s  %(levelname)-8s  %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
)

# ─────────────────────────────────────────────────────────────────────────────
# CONFIGURATION
# ─────────────────────────────────────────────────────────────────────────────

# Sub-folder name → document class.  Each folder is named after its class,
# so the mapping is the identity over the six known classes.
DATASET_FOLDERS: dict[str, str] = {
    class_name: class_name
    for class_name in (
        "Autorisation",
        "Certificat",
        "fiche",
        "Mandat",
        "PlanMasse",
        "PlanSituation",
    )
}

# Filename patterns used to classify files in a flat directory (e.g. DataSet2).
# The dict is scanned in insertion order and the first match wins, so the
# more specific patterns come first to limit overlapping matches.
LABEL_PATTERNS: dict[str, str] = {
    "Mandat": r"\bmandat\b",
    "Certificat": (
        r"(certificat[- ]?d[- ]?adressage"
        r"|certificat[- ]?adr"
        r"|adr(?:essage)?)"
    ),
    "PlanMasse": r"plan[- ]?(?:de[- ])?masse",
    "PlanSituation": (
        r"plan[- ]?(?:de[- ])?situation"
        r"|situation"
    ),
    "fiche": (
        r"fiche[- ]?(?:de[- ])?renseignement"
        r"|renseignement"
    ),
    "Autorisation": r"(auto[- ]?urbanisme|arrete[- ]?pc|autorisation)",
}

OCR_LANG = "fra"      # Tesseract language code
RASTER_DPI = 200      # resolution used when rasterising PDF pages
BBOX_NORM = 1000      # upper bound of the normalised bounding-box range
MIN_CONF = 30         # tokens below this Tesseract confidence are dropped
SUPPORTED_EXT = {
    ".pdf",
    ".png",
    ".jpg", ".jpeg",
    ".tif", ".tiff",
}


# ─────────────────────────────────────────────────────────────────────────────
# IMAGE PRE-PROCESSING
# ─────────────────────────────────────────────────────────────────────────────

def preprocess_image(pil_img: Image.Image) -> Image.Image:
    """
    Turn a PIL page image into a cleaned-up greyscale image for Tesseract.

    Steps, in order
    ───────────────
    1. Convert to single-channel greyscale ("L" mode).
    2. If the longer side is below 2000 px, enlarge the image so that the
       longer side becomes 2000 px (Lanczos resampling).
    3. Deskew via `_deskew`.
    4. Adaptive Gaussian thresholding (block size 51, C = 10).
    5. Morphological opening with a 2x2 rectangular kernel.
    6. Unsharp mask: 1.8 * image - 0.8 * Gaussian blur (sigma 1.5).

    The input image is not modified; a new PIL image is returned.
    """
    grey = pil_img.convert("L")

    # Enlarge small scans before any filtering.
    width, height = grey.size
    longest = max(width, height)
    if longest < 2000:
        factor = 2000 / longest
        target_size = (int(width * factor), int(height * factor))
        grey = grey.resize(target_size, Image.LANCZOS)

    # Straighten the page while it is still greyscale.
    pixels = _deskew(np.array(grey, dtype=np.uint8))

    # Local thresholding copes with uneven lighting across the page.
    thresholded = cv2.adaptiveThreshold(
        pixels,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        blockSize=51,
        C=10,
    )

    # Opening with a tiny kernel drops isolated specks.
    speck_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    cleaned = cv2.morphologyEx(thresholded, cv2.MORPH_OPEN, speck_kernel)

    # Unsharp mask: weighted difference between the image and its blur.
    soft = cv2.GaussianBlur(cleaned, (0, 0), sigmaX=1.5)
    crisp = cv2.addWeighted(cleaned, 1.8, soft, -0.8, 0)

    return Image.fromarray(crisp)


def _deskew(arr: np.ndarray) -> np.ndarray:
    """
    Estimate the page skew from Hough lines and rotate the image to undo it.

    Only lines within 10 degrees of horizontal vote; the median of their
    angles is used, and corrections smaller than 0.3 degrees are skipped.
    The function is best-effort: if fewer than five lines are found, no
    line qualifies, or anything raises, the input array is returned
    unchanged (failures are logged at DEBUG level).

    `arr` is the image as a NumPy array (the caller in this file passes a
    2-D uint8 greyscale array).  The result has the same width and height.
    """
    try:
        edges = cv2.Canny(arr, 50, 150, apertureSize=3)
        lines = cv2.HoughLines(edges, 1, np.pi / 180, threshold=200)
        if lines is None or len(lines) < 5:
            return arr

        # HoughLines returns (rho, theta) pairs; theta is the angle of the
        # line's normal, so a horizontal line has theta = 90 degrees.
        skew = np.degrees(lines[:, 0, 1]) - 90
        skew = skew[np.abs(skew) < 10]
        if skew.size == 0:
            return arr

        median_angle = float(np.median(skew))
        if abs(median_angle) < 0.3:
            return arr

        h, w = arr.shape[:2]
        M = cv2.getRotationMatrix2D((w / 2, h / 2), median_angle, 1.0)
        rotated = cv2.warpAffine(
            arr, M, (w, h),
            flags=cv2.INTER_LINEAR,
            # Replicate edge pixels so the rotation does not add black corners.
            borderMode=cv2.BORDER_REPLICATE,
        )
        log.debug("Deskewed %.2f°", median_angle)
        return rotated
    except Exception as exc:
        # Deskewing is optional; never let it abort the page.
        log.debug("Deskew skipped: %s", exc)
        return arr


# ─────────────────────────────────────────────────────────────────────────────
# PDF β†’ IMAGES
# ─────────────────────────────────────────────────────────────────────────────

def pdf_to_images(pdf_path: Path, dpi: int = RASTER_DPI) -> list[Image.Image]:
    """
    Render each page of a PDF to an RGB PIL image at `dpi` dots per inch.

    Errors never propagate to the caller: a PDFPageCountError is logged as a
    warning, any other exception as an error, and in both cases an empty
    list is returned.
    """
    try:
        rendered = convert_from_path(
            str(pdf_path),
            dpi=dpi,
            fmt="png",
            thread_count=2,
        )
        log.info("  Rasterised %d page(s) from %s", len(rendered), pdf_path.name)
        rgb_pages = []
        for page in rendered:
            rgb_pages.append(page.convert("RGB"))
        return rgb_pages
    except PDFPageCountError:
        log.warning("  Empty PDF: %s", pdf_path.name)
    except Exception as exc:
        log.error("  pdf_to_images failed for %s: %s", pdf_path.name, exc)
    return []


# ─────────────────────────────────────────────────────────────────────────────
# OCR
# ─────────────────────────────────────────────────────────────────────────────

def run_ocr(pil_img: Image.Image, lang: str = OCR_LANG) -> dict:
    """
    Run Tesseract on a PIL image and return a structured result dict:

        words        – list of token strings
        bboxes       – pixel [x0, y0, x1, y1] per token
        bboxes_norm  – bboxes scaled to the [0, BBOX_NORM] range
        confs        – Tesseract confidence per token
        full_text    – raw OCR string (whole page), stripped
        width/height – image dimensions in pixels

    Tokens that are empty, have a confidence below MIN_CONF, or whose box
    collapses to zero area after clamping to the image are dropped; the
    four per-token lists always have the same length.
    """
    config = "--oem 1 --psm 6"
    w, h = pil_img.size

    data = pytesseract.image_to_data(
        pil_img, lang=lang, config=config, output_type=Output.DICT
    )

    words, bboxes, bboxes_norm, confs = [], [], [], []

    rows = zip(
        data["text"], data["conf"],
        data["left"], data["top"], data["width"], data["height"],
    )
    for text, raw_conf, left, top, box_w, box_h in rows:
        word = str(text).strip()
        if not word:
            continue

        # Depending on the Tesseract / pytesseract versions, the confidence
        # may arrive as an int, a float, or a float-formatted string such as
        # "96.5"; int() alone rejects the latter.
        try:
            conf = int(float(raw_conf))
        except (TypeError, ValueError):
            continue
        if conf < MIN_CONF:
            continue

        # Clamp the box to the image bounds.
        x0 = max(0, left)
        y0 = max(0, top)
        x1 = min(w, x0 + box_w)
        y1 = min(h, y0 + box_h)
        if x1 <= x0 or y1 <= y0:
            continue

        words.append(word)
        bboxes.append([x0, y0, x1, y1])
        bboxes_norm.append([
            int(x0 / w * BBOX_NORM),
            int(y0 / h * BBOX_NORM),
            int(x1 / w * BBOX_NORM),
            int(y1 / h * BBOX_NORM),
        ])
        confs.append(conf)

    # Second Tesseract pass: the page text is taken from image_to_string and
    # is therefore NOT filtered by MIN_CONF, unlike the token lists above.
    full_text = pytesseract.image_to_string(pil_img, lang=lang, config=config)

    return {
        "words":       words,
        "bboxes":      bboxes,
        "bboxes_norm": bboxes_norm,
        "confs":       confs,
        "full_text":   full_text.strip(),
        "width":       w,
        "height":      h,
    }


# ─────────────────────────────────────────────────────────────────────────────
# LABEL STUDIO TASK BUILDER  (fixed)
# ─────────────────────────────────────────────────────────────────────────────

def build_label_studio_task(
    image_path: Path,
    ocr_result: dict,
    doc_class: str,
    relative_image_url: Optional[str] = None,
) -> dict:
    """
    Build one Label Studio task for a page image and its OCR result.

    task["data"] always carries the two keys the OCR labelling template
    binds to:
        "image"  →  URL of the page PNG to display
        "ocr"    →  the raw OCR text of the page
    plus "doc_class" and "image_file" as extra metadata.

    The image URL is `relative_image_url` when the caller supplies one
    (for instance a path served by Label Studio itself); otherwise it is the
    file:// URI of the resolved `image_path`.

    Every OCR token becomes three entries in "predictions": a rectangle, a
    textarea holding the recognised word, and a rectanglelabels entry with
    an empty label list for the annotator to fill in.  The textarea and
    label entries reference the rectangle through "parent_id".

    `ocr_result` must provide "words", "bboxes" (pixel [x0, y0, x1, y1]),
    "full_text", "width" and "height" — the dict returned by run_ocr.
    """
    # Path.as_uri() yields a well-formed file URI on every platform
    # (file:///home/... on POSIX, file:///C:/... on Windows).
    url = relative_image_url or image_path.resolve().as_uri()
    w, h = ocr_result["width"], ocr_result["height"]

    results = []
    for idx, (word, (x0, y0, x1, y1)) in enumerate(
        zip(ocr_result["words"], ocr_result["bboxes"])
    ):
        # Pixel bbox → Label Studio percentage format:
        # x, y = top-left corner (%); width, height = size (%).
        geometry = {
            "x": round(x0 / w * 100, 4),
            "y": round(y0 / h * 100, 4),
            "width": round((x1 - x0) / w * 100, 4),
            "height": round((y1 - y0) / h * 100, 4),
            "rotation": 0,
        }
        region_id = f"r{idx}"

        # 1. Rectangle bounding box
        results.append({
            "id":        region_id,
            "from_name": "bbox",
            "to_name":   "image",
            "type":      "rectangle",
            "value":     dict(geometry),
        })

        # 2. Transcription text (the OCR word for this box)
        results.append({
            "id":        f"t{idx}",
            "from_name": "transcription",
            "to_name":   "image",
            "type":      "textarea",
            "parent_id": region_id,
            "value":     {**geometry, "text": [word]},
        })

        # 3. Empty label slot — the annotator picks the entity label
        results.append({
            "id":        f"l{idx}",
            "from_name": "label",
            "to_name":   "image",
            "type":      "rectanglelabels",
            "parent_id": region_id,
            "value":     {**geometry, "rectanglelabels": []},
        })

    return {
        "data": {
            # Keys the Label Studio OCR template binds to
            "image": url,
            "ocr":   ocr_result["full_text"],
            # Extra metadata, useful downstream
            "doc_class":  doc_class,
            "image_file": image_path.name,
        },
        "annotations": [],
        "predictions": [{"result": results, "score": 0.0}],
    }


# ─────────────────────────────────────────────────────────────────────────────
# MAIN PIPELINE
# ─────────────────────────────────────────────────────────────────────────────

def process_document(
    src_path: Path,
    img_dir: Path,
    ocr_dir: Path,
    doc_class: str,
    ls_tasks: list,
    stem: str,
) -> int:
    """
    Process one source file (PDF or image) and return the number of pages
    processed.

    For every page this writes three files named after `stem` and the page
    index: `<stem>_pNNN_raw.png` (page as rasterised) and `<stem>_pNNN.png`
    (pre-processed page) into `img_dir`, and `<stem>_pNNN.json` (OCR result
    plus source metadata) into `ocr_dir`.  One Label Studio task per page is
    appended to `ls_tasks`, which is mutated in place.

    Unsupported extensions and unreadable image files are logged and
    yield 0.
    """
    ext = src_path.suffix.lower()

    if ext == ".pdf":
        pages = pdf_to_images(src_path, dpi=RASTER_DPI)
    elif ext in SUPPORTED_EXT:
        try:
            # The context manager closes the underlying file once the pixels
            # have been copied by convert(); only the frame Pillow opens by
            # default is read.
            with Image.open(src_path) as src_img:
                pages = [src_img.convert("RGB")]
        except Exception as exc:
            log.error("  Cannot open %s: %s", src_path.name, exc)
            return 0
    else:
        log.warning("  Unsupported type: %s", src_path.name)
        return 0

    processed = 0
    for page_idx, page_rgb in enumerate(pages):
        page_stem = f"{stem}_p{page_idx:03d}"

        # Save raw rasterised PNG (original colours, useful for inspection)
        raw_path = img_dir / f"{page_stem}_raw.png"
        page_rgb.save(raw_path, "PNG")

        # Pre-process, then save the clean version (used for OCR + LS display)
        page_proc = preprocess_image(page_rgb)
        proc_path = img_dir / f"{page_stem}.png"
        page_proc.save(proc_path, "PNG")

        # Run OCR on the pre-processed page
        ocr = run_ocr(page_proc, lang=OCR_LANG)
        log.info(
            "    Page %d → %d tokens | %d chars",
            page_idx, len(ocr["words"]), len(ocr["full_text"]),
        )

        # Save per-page OCR JSON (used later during dataset preparation)
        ocr_payload = {
            "source_file": src_path.name,
            "doc_class":   doc_class,
            "page_index":  page_idx,
            "image_file":  proc_path.name,
            **ocr,
        }
        (ocr_dir / f"{page_stem}.json").write_text(
            json.dumps(ocr_payload, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )

        # Build & collect the Label Studio task for this page
        ls_tasks.append(build_label_studio_task(
            image_path=proc_path,
            ocr_result=ocr,
            doc_class=doc_class,
        ))

        processed += 1

    return processed


def run_pipeline(dataset_dir: Path, output_dir: Path) -> None:
    """
    Process every supported document under `dataset_dir` and write the
    results to `output_dir`.

    Two dataset layouts are supported:
    1. Organized: one sub-folder per class, named after the keys of
       DATASET_FOLDERS (Autorisation/, Certificat/, ...).  Used as soon as
       at least one of those sub-folders exists.
    2. Flat: all files directly in `dataset_dir`, classified from their
       file names with `_classify_file` (DataSet2).  Files that match no
       pattern are reported and skipped.

    Besides the per-class `images/` and `ocr/` folders, the run writes
    `label_studio_tasks.json` into `output_dir` and prints a per-class
    summary table.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    ls_tasks: list[dict] = []
    summary: dict[str, dict] = {}

    is_organized = any(
        (dataset_dir / folder_name).is_dir() for folder_name in DATASET_FOLDERS
    )

    if is_organized:
        # ── Organized structure: one sub-directory per class ─────────────────
        for folder_name, doc_class in DATASET_FOLDERS.items():
            folder_path = dataset_dir / folder_name
            if not folder_path.is_dir():
                log.warning("Folder not found, skipping: %s", folder_path)
                continue

            img_dir, ocr_dir = _ensure_class_dirs(output_dir, doc_class)
            log.info("━━━ %s (%s) ━━━", doc_class, folder_name)

            files = _list_supported_files(folder_path)
            if not files:
                log.warning("  No supported files in %s", folder_path)
                continue

            total_pages = _process_files(
                files, img_dir, ocr_dir, doc_class, ls_tasks, indent="  "
            )
            summary[doc_class] = {"files": len(files), "pages": total_pages}
            log.info("  → %d file(s), %d page(s)", len(files), total_pages)

    else:
        # ── Flat structure: files at root, classified by pattern ─────────────
        log.info("━━━ Flat dataset structure (pattern-based classification) ━━━")

        files = _list_supported_files(dataset_dir)
        if not files:
            log.warning("  No supported files in %s", dataset_dir)

        # Group files by class, keeping the LABEL_PATTERNS order.
        classified: dict[str, list[Path]] = {
            doc_class: [] for doc_class in LABEL_PATTERNS
        }
        unclassified: list[Path] = []
        for src_file in files:
            doc_class = _classify_file(src_file.name)
            if doc_class:
                classified[doc_class].append(src_file)
            else:
                unclassified.append(src_file)

        for doc_class, class_files in classified.items():
            if not class_files:
                continue

            img_dir, ocr_dir = _ensure_class_dirs(output_dir, doc_class)
            log.info("  %s (%d files)", doc_class, len(class_files))

            total_pages = _process_files(
                class_files, img_dir, ocr_dir, doc_class, ls_tasks, indent="    "
            )
            summary[doc_class] = {"files": len(class_files), "pages": total_pages}
            log.info("    → %d page(s)", total_pages)

        # Unclassified files are only reported (first three names), not processed.
        if unclassified:
            log.warning(
                "  Unclassified (%d files): %s",
                len(unclassified),
                ", ".join(f.name for f in unclassified[:3]),
            )

    # Write Label Studio import file
    ls_path = output_dir / "label_studio_tasks.json"
    ls_path.write_text(
        json.dumps(ls_tasks, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    log.info("Label Studio tasks → %s  (%d tasks)", ls_path, len(ls_tasks))

    _print_summary(summary)


def _list_supported_files(directory: Path) -> list[Path]:
    """Return the regular files in `directory` with a supported extension, sorted."""
    return sorted(
        f for f in directory.iterdir()
        if f.is_file() and f.suffix.lower() in SUPPORTED_EXT
    )


def _ensure_class_dirs(output_dir: Path, doc_class: str) -> tuple[Path, Path]:
    """Create and return the `images/` and `ocr/` output folders of a class."""
    img_dir = output_dir / doc_class / "images"
    ocr_dir = output_dir / doc_class / "ocr"
    img_dir.mkdir(parents=True, exist_ok=True)
    ocr_dir.mkdir(parents=True, exist_ok=True)
    return img_dir, ocr_dir


def _unique_stem(stem: str, used: set[str]) -> str:
    """Return `stem`, suffixed with _2, _3, ... if it was already handed out."""
    candidate = stem
    counter = 1
    # Compare case-insensitively: some file systems treat A.png and a.png
    # as the same file.
    while candidate.casefold() in used:
        counter += 1
        candidate = f"{stem}_{counter}"
    used.add(candidate.casefold())
    return candidate


def _process_files(
    files: list[Path],
    img_dir: Path,
    ocr_dir: Path,
    doc_class: str,
    ls_tasks: list,
    indent: str,
) -> int:
    """Run process_document on each file of one class; return the total page count."""
    # Sources such as a.pdf and a.png sanitise to the same stem; without
    # de-duplication the second one would overwrite the first one's outputs.
    used_stems: set[str] = set()
    total_pages = 0
    for src_file in files:
        log.info("%sProcessing: %s", indent, src_file.name)
        total_pages += process_document(
            src_path=src_file,
            img_dir=img_dir,
            ocr_dir=ocr_dir,
            doc_class=doc_class,
            ls_tasks=ls_tasks,
            stem=_unique_stem(_safe_stem(src_file.stem), used_stems),
        )
    return total_pages


def _print_summary(summary: dict[str, dict]) -> None:
    """Print the per-class files/pages table followed by a TOTAL row."""
    print("\n" + "═" * 50)
    print(f"  {'Class':<22} {'Files':>6}  {'Pages':>6}")
    print("─" * 50)
    for cls, s in summary.items():
        print(f"  {cls:<22} {s['files']:>6}  {s['pages']:>6}")
    total_f = sum(s["files"] for s in summary.values())
    total_p = sum(s["pages"] for s in summary.values())
    print("─" * 50)
    print(f"  {'TOTAL':<22} {total_f:>6}  {total_p:>6}")
    print("═" * 50 + "\n")


# ─────────────────────────────────────────────────────────────────────────────
# HELPERS
# ─────────────────────────────────────────────────────────────────────────────

def _safe_stem(name: str) -> str:
    """Normalise a filename stem to ASCII-safe, space-free form."""
    nfkd = unicodedata.normalize("NFKD", name)
    ascii_str = nfkd.encode("ascii", "ignore").decode("ascii")
    return re.sub(r"[^\w\-]", "_", ascii_str)


def _classify_file(filename: str) -> Optional[str]:
    """
    Classify a file from its name; return the doc_class or None.

    The patterns of LABEL_PATTERNS are tried in order and the first match
    wins.  Each pattern is tested against the lower-cased name and against
    a relaxed form with accents removed and underscores turned into spaces,
    so that names such as "Arrêté_PC.pdf" or "mandat_dupont.pdf" match
    patterns written with plain letters, `[- ]` separators and `\\b`
    boundaries ("_" counts as a word character for `\\b`).
    """
    lowered = filename.lower()
    unaccented = (
        unicodedata.normalize("NFKD", lowered)
        .encode("ascii", "ignore")
        .decode("ascii")
    )
    relaxed = unaccented.replace("_", " ")
    candidates = (lowered,) if relaxed == lowered else (lowered, relaxed)

    for doc_class, pattern in LABEL_PATTERNS.items():
        if any(re.search(pattern, text) for text in candidates):
            return doc_class
    return None


def validate_classification(dataset_dir: Path) -> None:
    """
    Print how the files at the root of `dataset_dir` would be classified,
    without processing any of them.

    Files are grouped by the class `_classify_file` assigns to their name;
    files matching no pattern are listed under UNCLASSIFIED.  At most ten
    names are shown per group.
    """
    files = sorted(
        f for f in dataset_dir.iterdir()
        if f.is_file() and f.suffix.lower() in SUPPORTED_EXT
    )

    if not files:
        log.warning("No supported files in %s", dataset_dir)
        return

    # Insertion order = LABEL_PATTERNS order, with the unclassified bucket last.
    classified: dict[str, list[str]] = {doc_class: [] for doc_class in LABEL_PATTERNS}
    classified["_unclassified"] = []

    for src_file in files:
        bucket = _classify_file(src_file.name) or "_unclassified"
        classified[bucket].append(src_file.name)

    # Print results
    print("\n" + "═" * 70)
    print(f"  CLASSIFICATION VALIDATION  ({len(files)} files)")
    print("═" * 70)

    for doc_class, files_in_class in classified.items():
        if not files_in_class:
            continue
        display_class = "UNCLASSIFIED" if doc_class == "_unclassified" else doc_class
        print(f"\n  {display_class}  ({len(files_in_class)} files)")
        print("  " + "─" * 66)
        for fname in files_in_class[:10]:  # show the first 10 only
            print(f"    • {fname}")
        if len(files_in_class) > 10:
            print(f"    ... and {len(files_in_class) - 10} more")

    print("\n" + "═" * 70 + "\n")


# ─────────────────────────────────────────────────────────────────────────────
# CLI
# ─────────────────────────────────────────────────────────────────────────────

def _parse_args() -> argparse.Namespace:
    """
    Parse the script's command-line options.

    Options: --dataset_dir and --output_dir (paths), --dpi, --lang and
    --min_conf (defaulting to the module-level RASTER_DPI, OCR_LANG and
    MIN_CONF), and the --validate flag.
    """
    parser = argparse.ArgumentParser(description="Rasterise + OCR for GuichetOI_ML")

    # Input / output locations
    path_options = (
        ("--dataset_dir", "DataRef"),
        ("--output_dir", "processed_dataref"),
    )
    for flag, default_location in path_options:
        parser.add_argument(flag, type=Path, default=Path(default_location))

    # Rasterisation / OCR settings
    parser.add_argument("--dpi", type=int, default=RASTER_DPI)
    parser.add_argument("--lang", type=str, default=OCR_LANG)
    parser.add_argument("--min_conf", type=int, default=MIN_CONF)

    # Dry-run mode
    parser.add_argument(
        "--validate",
        action="store_true",
        help="Only validate classification, don't process files",
    )

    return parser.parse_args()


if __name__ == "__main__":
    args = _parse_args()

    # Fail early with a readable message instead of a traceback from
    # Path.iterdir() deep inside the pipeline.
    if not args.dataset_dir.is_dir():
        sys.exit(f"Dataset directory not found: {args.dataset_dir.resolve()}")

    # CLI values replace the module-level defaults; the pipeline functions
    # read these globals when they are called.
    RASTER_DPI = args.dpi
    OCR_LANG   = args.lang
    MIN_CONF   = args.min_conf

    log.info("Dataset : %s", args.dataset_dir.resolve())
    log.info("Output  : %s", args.output_dir.resolve())
    log.info("DPI=%d  lang=%s  min_conf=%d", RASTER_DPI, OCR_LANG, MIN_CONF)

    if args.validate:
        log.info("Running classification validation (no files will be processed)")
        validate_classification(dataset_dir=args.dataset_dir)
    else:
        run_pipeline(dataset_dir=args.dataset_dir, output_dir=args.output_dir)