rbaks committed on
Commit
d7f1cd8
·
verified ·
1 Parent(s): 941b252

Upload document_readability.py

Browse files
Files changed (1) hide show
  1. document_readability.py +544 -0
document_readability.py ADDED
@@ -0,0 +1,544 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Document Readability Scorer
3
+ ============================
4
+ A multi-signal pre-screening system for document validation pipelines.
5
+ Scores documents on readability before expensive OCR/LLM inference.
6
+
7
+ Signals extracted (all normalized to 0-1, higher = better):
8
+ 1. Sharpness — Laplacian variance + FFT high-freq energy
9
+ 2. Contrast — RMS contrast + Michelson contrast
10
+ 3. Noise level — Estimated noise sigma (inverted: low noise = high score)
11
+ 4. Text presence — MSER-based text region coverage + edge density
12
+ 5. Brightness — Penalizes over/under-exposed documents
13
+ 6. Entropy — Shannon entropy (blank pages score low)
14
+ 7. Learned IQA — CLIP-IQA or BRISQUE via pyiqa (optional, GPU-free)
15
+
16
+ The composite "readability_score" is a weighted sum of these signals.
17
+ Weights are fully configurable for calibration to your pipeline.
18
+
19
+ Usage:
20
+ scorer = DocumentReadabilityScorer()
21
+ result = scorer.score("document.png")
22
+ print(result.readability_score) # float in [0, 1]
23
+ print(result.ocr_recommended) # bool
24
+ print(result.signals) # dict of all sub-scores
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import warnings
30
+ from dataclasses import dataclass, field
31
+ from pathlib import Path
32
+ from typing import Optional, Union
33
+
34
+ import cv2
35
+ import numpy as np
36
+ from PIL import Image
37
+ from scipy import ndimage
38
+ from skimage.filters import sobel
39
+ from skimage.measure import shannon_entropy
40
+
41
+ warnings.filterwarnings("ignore", category=UserWarning)
42
+
43
+
44
+ # ─── Configuration ───────────────────────────────────────────────────────────
45
+
46
@dataclass
class ScorerConfig:
    """Tunable weights, thresholds and normalization caps for the scorer.

    The seven ``w_*`` weights are expected to add up to 1.0; ``validate``
    enforces that with a tolerance of 0.01. Tune the values to calibrate
    the scorer for a particular document distribution.
    """

    # Weight of each signal in the composite score.
    w_sharpness: float = 0.30
    w_contrast: float = 0.15
    w_noise: float = 0.10
    w_text_presence: float = 0.15
    w_brightness: float = 0.05
    w_entropy: float = 0.10
    w_learned_iqa: float = 0.15

    # Composite score below which OCR is not recommended.
    ocr_threshold: float = 0.45

    # Normalization constants.
    laplacian_cap: float = 800.0  # Laplacian variance that maps to sharpness 1.0
    noise_cap: float = 15.0  # noise sigma that maps to noise score 0.0
    min_text_coverage: float = 0.01  # coverage at or below this is likely blank

    # Learned metric name ("clipiqa", "brisque", "niqe", "topiq_nr") or None
    # to disable the learned signal.
    learned_metric: Optional[str] = "clipiqa"

    # Device string handed to the learned metric.
    device: str = "cpu"

    def validate(self):
        """Raise ``ValueError`` if the weights do not sum to 1.0 (+/- 0.01)."""
        weights = (
            self.w_sharpness,
            self.w_contrast,
            self.w_noise,
            self.w_text_presence,
            self.w_brightness,
            self.w_entropy,
            self.w_learned_iqa,
        )
        total = sum(weights)
        if abs(total - 1.0) > 0.01:
            raise ValueError(f"Weights must sum to 1.0, got {total:.3f}")
82
+
83
+
84
+ # ─── Signal Extractors ──────────────────────────────────────────────────────
85
+
86
+ def _load_gray(image: Union[str, Path, np.ndarray, Image.Image]) -> np.ndarray:
87
+ """Load image as grayscale numpy array."""
88
+ if isinstance(image, (str, Path)):
89
+ img = cv2.imread(str(image))
90
+ if img is None:
91
+ raise FileNotFoundError(f"Cannot read image: {image}")
92
+ return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
93
+ elif isinstance(image, Image.Image):
94
+ return np.array(image.convert("L"))
95
+ elif isinstance(image, np.ndarray):
96
+ if image.ndim == 3:
97
+ return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
98
+ return image
99
+ raise TypeError(f"Unsupported image type: {type(image)}")
100
+
101
+
102
+ def _load_color(image: Union[str, Path, np.ndarray, Image.Image]) -> np.ndarray:
103
+ """Load image as BGR numpy array."""
104
+ if isinstance(image, (str, Path)):
105
+ img = cv2.imread(str(image))
106
+ if img is None:
107
+ raise FileNotFoundError(f"Cannot read image: {image}")
108
+ return img
109
+ elif isinstance(image, Image.Image):
110
+ return cv2.cvtColor(np.array(image.convert("RGB")), cv2.COLOR_RGB2BGR)
111
+ elif isinstance(image, np.ndarray):
112
+ return image
113
+ raise TypeError(f"Unsupported image type: {type(image)}")
114
+
115
+
116
def sharpness_score(gray: np.ndarray, laplacian_cap: float = 800.0) -> dict:
    """
    Sharpness via Laplacian variance + FFT high-frequency energy.

    Laplacian variance measures second-derivative magnitude. Author's
    guidance for document scans:
        - Sharp document text: 200-2000+
        - Moderately blurry: 50-200
        - Very blurry: <50

    The FFT term is the fraction of spectral magnitude lying outside a
    small disc around the (shifted) zero frequency.

    Args:
        gray: 2-D grayscale image.
        laplacian_cap: Laplacian variance at which the Laplacian term
            saturates at 1.0.

    Returns:
        Dict with ``sharpness`` (combined score clipped to [0, 1]),
        ``laplacian_variance`` (raw) and ``high_freq_ratio`` (raw).
    """
    # Laplacian variance, capped and scaled to [0, 1].
    lap_var = float(cv2.Laplacian(gray, cv2.CV_64F).var())
    lap_norm = min(lap_var / laplacian_cap, 1.0)

    # Magnitude spectrum with the zero frequency moved to the centre.
    h, w = gray.shape
    spectrum = np.fft.fftshift(np.fft.fft2(gray.astype(np.float64)))
    magnitude = np.abs(spectrum)
    total_energy = float(magnitude.sum())

    if total_energy > 0.0:
        # Low-frequency mask: centre disc, radius = 5% of the min dimension.
        cy, cx = h // 2, w // 2
        radius = int(min(h, w) * 0.05)
        Y, X = np.ogrid[:h, :w]
        low_freq_mask = ((Y - cy) ** 2 + (X - cx) ** 2) <= radius ** 2
        low_energy = magnitude[low_freq_mask].sum()
        high_freq_ratio = float(1.0 - low_energy / (total_energy + 1e-10))
    else:
        # An all-zero image has no spectral energy at all. Without this
        # guard the ratio evaluates to 1 - 0/eps = 1.0, i.e. a blank black
        # frame would be reported as maximally "high frequency".
        high_freq_ratio = 0.0

    # Combined sharpness: 70% Laplacian + 30% FFT.
    combined = 0.7 * lap_norm + 0.3 * high_freq_ratio

    return {
        "sharpness": float(np.clip(combined, 0, 1)),
        "laplacian_variance": lap_var,
        "high_freq_ratio": high_freq_ratio,
    }
155
+
156
+
157
def contrast_score(gray: np.ndarray) -> dict:
    """
    Score contrast as a blend of RMS contrast and Michelson contrast.

    RMS contrast is the pixel standard deviation divided by 255 and
    saturates at 0.30. Michelson contrast is (max - min) / (max + min)
    and is used as-is. The blend is 60% RMS, 40% Michelson.

    Returns:
        Dict with ``contrast`` (blend clipped to [0, 1]), ``rms_contrast``
        and ``michelson_contrast`` (both raw).
    """
    rms = float(gray.std() / 255.0)

    brightest = float(gray.max())
    darkest = float(gray.min())
    # The tiny epsilon keeps an all-black image from dividing by zero.
    michelson = (brightest - darkest) / (brightest + darkest + 1e-10)

    rms_part = min(rms / 0.30, 1.0)
    blended = 0.6 * rms_part + 0.4 * michelson

    result = {"contrast": float(np.clip(blended, 0, 1))}
    result["rms_contrast"] = rms
    result["michelson_contrast"] = float(michelson)
    return result
182
+
183
+
184
def noise_score(gray: np.ndarray, noise_cap: float = 15.0) -> dict:
    """
    Estimate image noise (Immerkær, 1996) and turn it into a score.

    The image is convolved with a 3x3 Laplacian-difference kernel; the
    mean absolute response, scaled by sqrt(pi / 2) / 6, is the sigma
    estimate. Author's guidance: clean documents have sigma < 3, noisy
    scans 5-15, very noisy > 15.

    Args:
        gray: Grayscale image.
        noise_cap: Sigma at (and above) which the score bottoms out at 0.

    Returns:
        Dict with ``noise`` (1.0 = noise-free, clipped to [0, 1]) and
        ``noise_sigma`` (raw estimate).
    """
    kernel = np.array(
        [[1, -2, 1],
         [-2, 4, -2],
         [1, -2, 1]],
        dtype=np.float64,
    )
    response = ndimage.convolve(gray.astype(np.float64), kernel)
    sigma = float(np.abs(response).mean() * np.sqrt(np.pi / 2) / 6.0)

    # Inverted scale: the less noise, the higher the score.
    capped_ratio = min(sigma / noise_cap, 1.0)
    noise_norm = 1.0 - capped_ratio

    return {"noise": float(np.clip(noise_norm, 0, 1)), "noise_sigma": sigma}
204
+
205
+
206
def text_presence_score(gray: np.ndarray, min_coverage: float = 0.01) -> dict:
    """
    Score how much text-like structure the image contains.

    Two measurements are averaged with equal weight:
      * coverage - fraction of pixels inside the convex hulls of MSER
        (Maximally Stable Extremal Regions) detections, saturating at 0.10;
      * edge density - mean Sobel response of the image scaled to [0, 1],
        saturating at 0.08.

    Args:
        gray: Grayscale image.
        min_coverage: Coverage above which the page counts as having text.

    Returns:
        Dict with ``text_presence`` (clipped to [0, 1]), ``text_coverage``,
        ``edge_density`` and ``has_text`` (coverage above *min_coverage* or
        edge density above 0.02).
    """
    detector = cv2.MSER_create()
    detector.setDelta(5)
    detector.setMinArea(30)
    detector.setMaxArea(int(gray.size * 0.05))
    detector.setMaxVariation(0.25)
    try:
        regions, _ = detector.detectRegions(gray)
    except cv2.error:
        # Detection failure is treated as "no regions found".
        regions = []

    text_coverage = 0.0
    if regions:
        # Paint each region's convex hull, then measure the painted area.
        mask = np.zeros_like(gray)
        for points in regions:
            hull = cv2.convexHull(points.reshape(-1, 1, 2))
            cv2.fillPoly(mask, [hull], 255)
        text_coverage = float(mask.sum() / (255.0 * mask.size))

    edge_density = float(sobel(gray.astype(np.float64) / 255.0).mean())

    coverage_part = min(text_coverage / 0.10, 1.0)
    edge_part = min(edge_density / 0.08, 1.0)
    blended = 0.5 * coverage_part + 0.5 * edge_part

    has_text = text_coverage > min_coverage or edge_density > 0.02

    return {
        "text_presence": float(np.clip(blended, 0, 1)),
        "text_coverage": text_coverage,
        "edge_density": edge_density,
        "has_text": has_text,
    }
251
+
252
+
253
def brightness_score(gray: np.ndarray) -> dict:
    """
    Brightness assessment - penalizes over/under-exposure.

    The mean gray level is mapped through a piecewise-linear, continuous
    curve:
        mean < 60        0.0 -> 0.3   (very dark)
        60 <= mean < 140 0.3 -> 0.8   (dim)
        140..250         0.8 -> 1.0 -> 0.833, peak at 200 (normal paper)
        250 < mean       0.833 -> 0.4 at 255 (nearly blank white)

    An exposure penalty (at most 0.5) is then subtracted: three times the
    fraction of pixels below 15, plus five times the amount by which the
    fraction of pixels equal to 255 exceeds 0.95. A page that is 90% pure
    white is therefore not penalized - paper is white.

    Args:
        gray: Grayscale image with values on a 0-255 scale.

    Returns:
        Dict with ``brightness`` (clipped to [0, 1]), ``mean_brightness``,
        ``dark_pixel_frac`` and ``bright_pixel_frac`` (fraction == 255).
    """
    mean_brightness = float(gray.mean())

    # Fractions of truly problematic pixels.
    dark_frac = float((gray < 15).sum() / gray.size)  # crushed to black
    pure_white_frac = float((gray == 255).sum() / gray.size)  # saturated

    if mean_brightness < 60:
        bright_norm = mean_brightness / 60.0 * 0.3
    elif mean_brightness < 140:
        bright_norm = 0.3 + (mean_brightness - 60) / 80.0 * 0.5
    elif mean_brightness <= 250:
        # Wide sweet spot: gentle falloff either side of the peak at 200.
        dist_from_ideal = abs(mean_brightness - 200) / 60.0
        bright_norm = 1.0 - dist_from_ideal * 0.2
    else:
        # Continue downward from the value the previous segment reaches at
        # 250 and land on 0.4 at 255. Restarting from 1.0 here would make
        # a nearly blank page (mean 250.1) outscore a normal document.
        score_at_250 = 1.0 - abs(250.0 - 200) / 60.0 * 0.2
        overshoot = (mean_brightness - 250.0) / 5.0
        bright_norm = max(0.4, score_at_250 - overshoot * (score_at_250 - 0.4))

    # Only penalize mostly crushed blacks or an almost entirely saturated page.
    exposure_penalty = min(dark_frac * 3 + max(0, pure_white_frac - 0.95) * 5, 0.5)
    bright_norm = max(0, bright_norm - exposure_penalty)

    return {
        "brightness": float(np.clip(bright_norm, 0, 1)),
        "mean_brightness": mean_brightness,
        "dark_pixel_frac": dark_frac,
        "bright_pixel_frac": pure_white_frac,
    }
298
+
299
+
300
def entropy_score(gray: np.ndarray) -> dict:
    """
    Score information content from the image's Shannon entropy.

    Author's guidance: blank or uniform pages land around 0-3, text
    documents around 5-7, complex images around 7-8. The raw entropy is
    divided by 5.5 and capped, so anything at or above 5.5 scores 1.0.

    Returns:
        Dict with ``entropy`` (clipped to [0, 1]) and ``shannon_entropy``
        (raw value from ``skimage.measure.shannon_entropy``).
    """
    raw_entropy = float(shannon_entropy(gray))
    scaled = min(raw_entropy / 5.5, 1.0)

    result = {"entropy": float(np.clip(scaled, 0, 1))}
    result["shannon_entropy"] = raw_entropy
    return result
318
+
319
+
320
+ # ─── Learned IQA (optional) ─────────────────────────────────────────────────
321
+
322
# Metric objects created by pyiqa, keyed by "<metric_name>_<device>", so each
# metric is constructed only once per process.
_iqa_cache: dict = {}


def learned_iqa_score(
    image: Union[str, Path, np.ndarray, Image.Image],
    metric_name: str = "clipiqa",
    device: str = "cpu",
) -> dict:
    """
    Learned no-reference IQA via the pyiqa library.

    Metrics named by the original author, with the normalization applied
    here (result is always in [0, 1], higher = better):
        - clipiqa:  higher = better, clipped to [0, 1]
        - topiq_nr: higher = better, clipped to [0, 1]
        - brisque:  lower = better, mapped as 1 - raw / 100
        - niqe:     lower = better, mapped as 1 - raw / 20
    Any other lower-is-better metric is mapped as 1 - raw / 50.

    Args:
        image: File path, NumPy array (2-D gray or BGR) or PIL image.
        metric_name: Name passed to ``pyiqa.create_metric``.
        device: Device string for the metric and the input tensor.

    Returns:
        Dict with ``learned_iqa`` (normalized), ``<metric_name>_raw`` (raw
        metric output) and ``metric_name``.

    Raises:
        TypeError: *image* is none of the supported types.
    """
    # Imported lazily so the module works without torch/pyiqa installed
    # when the learned metric is not used.
    import torch
    import pyiqa

    cache_key = f"{metric_name}_{device}"
    if cache_key not in _iqa_cache:
        _iqa_cache[cache_key] = pyiqa.create_metric(metric_name, device=device)

    metric = _iqa_cache[cache_key]
    lower_better = metric.lower_better

    # Normalize every supported input to an RGB PIL image.
    if isinstance(image, (str, Path)):
        # Image.open keeps the file handle open; close it as soon as the
        # converted (fully loaded) copy exists.
        with Image.open(str(image)) as source:
            pil_img = source.convert("RGB")
    elif isinstance(image, np.ndarray):
        if image.ndim == 2:
            pil_img = Image.fromarray(image).convert("RGB")
        else:
            pil_img = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    elif isinstance(image, Image.Image):
        pil_img = image.convert("RGB")
    else:
        raise TypeError(f"Unsupported type: {type(image)}")

    # Downscale for speed so the longer side is at most 512 px.
    max_dim = 512
    w, h = pil_img.size
    if max(w, h) > max_dim:
        scale = max_dim / max(w, h)
        # Clamp to 1 px: for extreme aspect ratios int() would truncate the
        # short side to 0, which resize() rejects.
        new_size = (max(1, int(w * scale)), max(1, int(h * scale)))
        pil_img = pil_img.resize(new_size, Image.LANCZOS)

    # HWC uint8 -> 1xCxHxW float in [0, 1].
    img_tensor = torch.from_numpy(
        np.array(pil_img).transpose(2, 0, 1)
    ).float().unsqueeze(0) / 255.0
    img_tensor = img_tensor.to(device)

    with torch.no_grad():
        raw_score = float(metric(img_tensor).item())

    # Normalize to [0, 1], higher = better.
    if lower_better:
        if metric_name == "brisque":
            normalized = float(np.clip(1.0 - raw_score / 100.0, 0, 1))
        elif metric_name == "niqe":
            normalized = float(np.clip(1.0 - raw_score / 20.0, 0, 1))
        else:
            normalized = float(np.clip(1.0 - raw_score / 50.0, 0, 1))
    else:
        normalized = float(np.clip(raw_score, 0, 1))

    return {
        "learned_iqa": normalized,
        f"{metric_name}_raw": raw_score,
        "metric_name": metric_name,
    }
392
+
393
+
394
+ # ─── Main Scorer ─────────────────────────────────────────────────────────────
395
+
396
@dataclass
class ReadabilityResult:
    """Complete readability assessment for a document image.

    Fields can be read as attributes (``result.readability_score``) or by
    subscript (``result["readability_score"]``), so the dict-style usage
    shown in the module docstring works on the returned object.
    """
    readability_score: float  # Composite score [0, 1]
    ocr_recommended: bool  # Whether to proceed with OCR
    confidence_label: str  # "excellent" / "good" / "fair" / "poor" / "bad"
    signals: dict  # All individual signal scores and raw values
    config: dict  # Config used for this scoring

    def __getitem__(self, key: str):
        """Return the field named *key*.

        Raises:
            KeyError: *key* is not one of the dataclass fields.
        """
        # Restrict lookups to declared fields so methods and dunder
        # attributes are not reachable through subscripting.
        if key in self.__dataclass_fields__:
            return getattr(self, key)
        raise KeyError(key)

    def to_dict(self) -> dict:
        """Return the result as a plain dict; ``config`` is not included."""
        return {
            "readability_score": self.readability_score,
            "ocr_recommended": self.ocr_recommended,
            "confidence_label": self.confidence_label,
            "signals": self.signals,
        }
412
+
413
+
414
class DocumentReadabilityScorer:
    """
    Multi-signal document readability scorer.

    Runs the classical signal extractors of this module (plus, when
    configured, a learned IQA metric) and combines them into one weighted
    score using the weights of a ``ScorerConfig``.

    Example:
        scorer = DocumentReadabilityScorer()
        result = scorer.score("scan.png")
        if result.ocr_recommended:
            run_ocr(...)
        else:
            log_rejected(result.signals)
    """

    def __init__(self, config: Optional[ScorerConfig] = None):
        """Store *config* (a default ``ScorerConfig`` if omitted) and validate it.

        Raises:
            ValueError: The configured weights do not sum to 1.0.
        """
        self.config = config or ScorerConfig()
        self.config.validate()

    @staticmethod
    def _label_for(score: float) -> str:
        """Map a composite score to its coarse quality label."""
        if score >= 0.80:
            return "excellent"
        if score >= 0.60:
            return "good"
        if score >= 0.40:
            return "fair"
        if score >= 0.20:
            return "poor"
        return "bad"

    def score(
        self,
        image: Union[str, Path, np.ndarray, Image.Image],
    ) -> ReadabilityResult:
        """
        Score a document image for readability.

        Args:
            image: File path, numpy array (BGR or gray), or PIL Image.

        Returns:
            ReadabilityResult with composite score, sub-signals, and
            recommendation. The label and the OCR recommendation are
            derived from the same rounded score that is reported.
        """
        cfg = self.config
        gray = _load_gray(image)

        # Classical signals, all computed on the grayscale image.
        sharp = sharpness_score(gray, cfg.laplacian_cap)
        cont = contrast_score(gray)
        noi = noise_score(gray, cfg.noise_cap)
        text = text_presence_score(gray, cfg.min_text_coverage)
        bright = brightness_score(gray)
        ent = entropy_score(gray)

        # Optional learned IQA.
        if cfg.learned_metric:
            try:
                iqa = learned_iqa_score(image, cfg.learned_metric, cfg.device)
            except Exception as e:
                # Best effort: a missing or failing learned metric must not
                # abort scoring. Its weight is NOT redistributed; a neutral
                # 0.5 is used and the error is recorded in the signals.
                iqa = {"learned_iqa": 0.5, "error": str(e), "metric_name": cfg.learned_metric}
        else:
            iqa = {"learned_iqa": 0.5, "metric_name": "disabled"}

        # Weighted sum of the normalized signals.
        composite = (
            cfg.w_sharpness * sharp["sharpness"] +
            cfg.w_contrast * cont["contrast"] +
            cfg.w_noise * noi["noise"] +
            cfg.w_text_presence * text["text_presence"] +
            cfg.w_brightness * bright["brightness"] +
            cfg.w_entropy * ent["entropy"] +
            cfg.w_learned_iqa * iqa["learned_iqa"]
        )
        composite = float(np.clip(composite, 0, 1))

        # Round once and use the rounded value everywhere, so a reported
        # score of e.g. 0.45 can never disagree with the threshold decision
        # or the label shown next to it.
        final_score = round(composite, 4)
        label = self._label_for(final_score)

        # Merge all signal dicts (normalized scores and raw values).
        signals = {}
        for d in [sharp, cont, noi, text, bright, ent, iqa]:
            signals.update(d)

        return ReadabilityResult(
            readability_score=final_score,
            ocr_recommended=final_score >= cfg.ocr_threshold,
            confidence_label=label,
            signals=signals,
            config={
                "weights": {
                    "sharpness": cfg.w_sharpness,
                    "contrast": cfg.w_contrast,
                    "noise": cfg.w_noise,
                    "text_presence": cfg.w_text_presence,
                    "brightness": cfg.w_brightness,
                    "entropy": cfg.w_entropy,
                    "learned_iqa": cfg.w_learned_iqa,
                },
                "ocr_threshold": cfg.ocr_threshold,
                "learned_metric": cfg.learned_metric or "disabled",
            },
        )
513
+
514
+
515
+ # ─── Batch processing helper ─────────────────────────────────────────────────
516
+
517
def score_batch(
    image_paths: list[Union[str, Path]],
    config: Optional[ScorerConfig] = None,
    sort_by_score: bool = True,
) -> list[dict]:
    """Score a batch of documents and optionally sort by readability.

    Args:
        image_paths: Paths of the images to score.
        config: Scorer configuration; the default configuration if omitted.
        sort_by_score: When true, results are ordered best score first.

    Returns:
        One dict per input with ``path``, ``readability_score``,
        ``ocr_recommended``, ``confidence_label`` and ``signals``. Inputs
        that failed to score get a score of 0.0, the label ``"error"``,
        empty ``signals`` and an extra ``error`` message.
    """
    scorer = DocumentReadabilityScorer(config)
    results = []
    for path in image_paths:
        try:
            result = scorer.score(path)
        except Exception as e:
            # Batch boundary: one unreadable file must not abort the run.
            # The row keeps the same keys as a successful one (including
            # an empty "signals") so consumers can treat rows uniformly.
            results.append({
                "path": str(path),
                "readability_score": 0.0,
                "ocr_recommended": False,
                "confidence_label": "error",
                "signals": {},
                "error": str(e),
            })
        else:
            results.append({
                "path": str(path),
                **result.to_dict(),
            })

    if sort_by_score:
        results.sort(key=lambda x: x["readability_score"], reverse=True)

    return results