ahczhg committed on
Commit
3354205
·
verified ·
1 Parent(s): 2f0eb82

Upload agentic_ocr_extractor.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. agentic_ocr_extractor.py +617 -0
agentic_ocr_extractor.py ADDED
@@ -0,0 +1,617 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Lightweight Agentic OCR Document Extraction (Tesseract)
3
+
4
+ A lightweight, agentic OCR pipeline to extract text and structured fields from document images.
5
+
6
+ Key features:
7
+ - Multiple preprocessing variants (grayscale, thresholding, sharpening, denoise, resize)
8
+ - Multiple Tesseract page segmentation modes (PSM)
9
+ - Candidate scoring via average OCR confidence, adjusted for text length and word count
10
+ - Simple rule-based field extraction (DOI, title, authors, abstract, keywords)
11
+ """
12
+
13
+ import os
14
+ import re
15
+ import json
16
+ import argparse
17
+ import unicodedata
18
+ from dataclasses import dataclass, asdict
19
+ from typing import Dict, List, Tuple, Optional, Any
20
+ from concurrent.futures import ThreadPoolExecutor, as_completed
21
+
22
+ import numpy as np
23
+ import cv2
24
+ from PIL import Image
25
+
26
+ import pytesseract
27
+ from pytesseract import Output
28
+
29
+
30
+ # ============================================================================
31
+ # Preprocessing Variants
32
+ # ============================================================================
33
+
34
+ def _ensure_uint8(img: np.ndarray) -> np.ndarray:
35
+ """Ensure image is uint8 dtype, clipping values if needed."""
36
+ if img.dtype == np.uint8:
37
+ return img
38
+ return np.clip(img, 0, 255).astype(np.uint8)
39
+
40
+
41
def preprocess_variants(rgb_img: np.ndarray, scale_factor: float = 1.5) -> Dict[str, np.ndarray]:
    """Build the preprocessed images that OCR will be attempted on.

    Args:
        rgb_img: Source image. It is converted with ``cv2.COLOR_RGB2GRAY``,
            so a 3-channel RGB array is expected.
        scale_factor: Multiplier applied to width and height for the
            ``'upscaled'`` variant.

    Returns:
        Dict mapping variant name to image, in this insertion order:
        ``raw``, ``upscaled``, ``gray``, ``otsu``, ``adaptive``, ``denoise``,
        ``sharpen``, ``clahe``, ``morph_close``. ``raw`` is the input array
        itself (not a copy); every grayscale-derived variant is converted
        back to 3 channels with ``cv2.COLOR_GRAY2RGB``.
    """
    def as_rgb(single_channel: np.ndarray) -> np.ndarray:
        # All variants are returned as 3-channel images.
        return cv2.cvtColor(single_channel, cv2.COLOR_GRAY2RGB)

    # Upscaling often helps OCR on small text.
    height, width = rgb_img.shape[:2]
    target_size = (int(width * scale_factor), int(height * scale_factor))
    upscaled = cv2.resize(rgb_img, target_size, interpolation=cv2.INTER_CUBIC)

    gray = cv2.cvtColor(rgb_img, cv2.COLOR_RGB2GRAY)

    # Global (Otsu) and local (adaptive Gaussian) binarisation.
    _, otsu = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    adaptive = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 35, 11
    )

    denoised = cv2.fastNlMeansDenoising(
        gray, None, h=15, templateWindowSize=7, searchWindowSize=21
    )

    # 3x3 sharpening kernel: centre weight 5, 4-neighbours -1.
    sharpen_kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]], dtype=np.float32)
    sharpened = cv2.filter2D(gray, -1, sharpen_kernel)

    # CLAHE improves local contrast.
    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    contrast_enhanced = equalizer.apply(gray)

    # Morphological closing on the Otsu image helps with broken characters.
    closing_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    closed = cv2.morphologyEx(otsu, cv2.MORPH_CLOSE, closing_kernel)

    return {
        'raw': rgb_img,
        'upscaled': upscaled,
        'gray': as_rgb(gray),
        'otsu': as_rgb(otsu),
        'adaptive': as_rgb(adaptive),
        'denoise': as_rgb(denoised),
        'sharpen': as_rgb(_ensure_uint8(sharpened)),
        'clahe': as_rgb(contrast_enhanced),
        'morph_close': as_rgb(closed),
    }
87
+
88
+
89
+ # ============================================================================
90
+ # OCR Functions
91
+ # ============================================================================
92
+
93
def ocr_with_confidence(rgb_img: np.ndarray, psm: int = 6) -> Tuple[str, float, int]:
    """Run Tesseract on ``rgb_img`` and summarise the result.

    Args:
        rgb_img: Image passed unchanged to pytesseract.
        psm: Tesseract page segmentation mode, used in the ``--psm`` option.

    Returns:
        ``(text, avg_conf, word_count)`` where ``text`` comes from
        ``image_to_string``, ``avg_conf`` is the mean confidence of the
        counted words (``0.0`` when none were counted) and ``word_count`` is
        the number of entries that had non-blank text and a confidence >= 0.
    """
    tess_config = f'--oem 3 --psm {psm}'
    details = pytesseract.image_to_data(rgb_img, output_type=Output.DICT, config=tess_config)

    accepted: List[float] = []
    for raw_conf, word in zip(details.get('conf', []), details.get('text', [])):
        try:
            confidence = float(raw_conf)
        except (ValueError, TypeError):
            continue
        # Skip blank entries and entries carrying a negative confidence.
        if not word or not str(word).strip():
            continue
        if confidence < 0:
            continue
        accepted.append(confidence)

    mean_conf = float(np.mean(accepted)) if accepted else 0.0
    full_text = pytesseract.image_to_string(rgb_img, config=tess_config)
    return full_text, mean_conf, len(accepted)
116
+
117
+
118
@dataclass
class OcrCandidate:
    """Result of one OCR attempt (one preprocessing variant with one PSM)."""
    variant: str      # name of the preprocessing variant the image came from
    psm: int          # Tesseract page segmentation mode used
    avg_conf: float   # average word confidence reported for this attempt
    text: str         # recognised text
    word_count: int   # number of words counted for the confidence average
    score: float      # combined score used for ranking candidates
127
+
128
+
129
def compute_score(avg_conf: float, text: str, word_count: int) -> float:
    """Combine confidence, text length and word count into a ranking score.

    The score starts at ``avg_conf``. It is halved when the stripped text is
    shorter than 40 characters, or multiplied by 0.8 when it is shorter than
    100, because very short output usually means OCR failed. More than 20
    words earns a 1.1 multiplier.
    """
    stripped_length = len(text.strip())

    if stripped_length < 40:
        length_penalty = 0.5
    elif stripped_length < 100:
        length_penalty = 0.8
    else:
        length_penalty = None

    result = avg_conf
    if length_penalty is not None:
        result = result * length_penalty
    if word_count > 20:
        result = result * 1.1
    return result
147
+
148
+
149
def _process_variant(args: Tuple[str, np.ndarray, int]) -> OcrCandidate:
    """OCR one ``(variant_name, image, psm)`` task and wrap the scored result.

    Takes a single tuple so it can be submitted directly to an executor.
    """
    variant_name, image, page_seg_mode = args
    recognised, confidence, n_words = ocr_with_confidence(image, psm=page_seg_mode)
    return OcrCandidate(
        variant=variant_name,
        psm=page_seg_mode,
        avg_conf=confidence,
        text=recognised,
        word_count=n_words,
        score=compute_score(confidence, recognised, n_words),
    )
158
+
159
+
160
def run_agent(
    rgb_img: np.ndarray,
    psms: Optional[List[int]] = None,
    scale_factor: float = 1.5,
    parallel: bool = True,
    top_k: int = 10,
    verbose: bool = True
) -> OcrCandidate:
    """Run OCR over every preprocessing variant and PSM and return the best.

    Args:
        rgb_img: Image handed to ``preprocess_variants``.
        psms: Tesseract page segmentation modes to try; defaults to
            ``[3, 4, 6, 11]`` when ``None``.
        scale_factor: Forwarded to ``preprocess_variants``.
        parallel: Run the variant/PSM combinations in a thread pool.
        top_k: Number of candidates shown in the leaderboard.
        verbose: Print the leaderboard of top candidates.

    Returns:
        The candidate with the highest combined score. Candidates with equal
        scores keep task order (variant order, then PSM order), so the result
        is the same for parallel and serial runs.

    Raises:
        ValueError: If ``psms`` is empty, since there is nothing to run.
    """
    if psms is None:
        psms = [3, 4, 6, 11]
    if not psms:
        # Without this guard the failure is an obscure max_workers error
        # (parallel) or an IndexError on the empty candidate list (serial).
        raise ValueError('psms must contain at least one page segmentation mode')

    variants = preprocess_variants(rgb_img, scale_factor=scale_factor)

    # One task per (variant, psm) combination.
    tasks = [(vname, vimg, psm) for vname, vimg in variants.items() for psm in psms]

    if parallel:
        with ThreadPoolExecutor(max_workers=min(8, len(tasks))) as executor:
            # map() yields results in submission order; collecting them in
            # completion order would make tie-breaking depend on thread timing.
            candidates: List[OcrCandidate] = list(executor.map(_process_variant, tasks))
    else:
        candidates = [_process_variant(task) for task in tasks]

    # Stable sort by combined score, best first.
    candidates.sort(key=lambda c: c.score, reverse=True)

    if verbose:
        print(f'Top {top_k} OCR candidates:')
        print('-' * 90)
        for c in candidates[:top_k]:
            preview = c.text.strip().replace('\n', ' ')[:60]
            print(f"{c.variant:12s} psm={c.psm:<2d} conf={c.avg_conf:5.1f} "
                  f"words={c.word_count:3d} score={c.score:5.1f} '{preview}...'")
        print('-' * 90)

    return candidates[0]
202
+
203
+
204
+ # ============================================================================
205
+ # Text Cleaning Utilities
206
+ # ============================================================================
207
+
208
def clean_text(text: str) -> str:
    """Normalise line endings and whitespace in OCR output.

    Steps, in order: convert ``\\r\\n`` and ``\\r`` to ``\\n``; collapse runs
    of non-newline whitespace to one space; strip spaces at line starts and
    ends; collapse repeated blank lines to a single blank line; strip the
    whole string.
    """
    result = text.replace('\r\n', '\n').replace('\r', '\n')

    substitutions = (
        (r'[^\S\n]+', ' ', 0),           # tabs / multiple spaces -> one space
        (r'^ +| +$', '', re.MULTILINE),  # spaces at line edges
        (r'\n\s*\n+', '\n\n', 0),        # keep at most one blank line
    )
    for pattern, replacement, flags in substitutions:
        result = re.sub(pattern, replacement, result, flags=flags)

    return result.strip()
223
+
224
+
225
def fix_ocr_artifacts(text: str) -> str:
    """Fix common OCR misreads and spacing artifacts.

    Rules are applied in a fixed order: single-letter and digit/letter
    confusions, re-joining words hyphenated across a line break, dropping
    stray single-symbol lines, collapsing runs of periods, and normalising
    spacing around punctuation.

    The intra-word rules (``0`` -> ``o``, ``1`` -> ``l``, and "insert a space
    after punctuation") are skipped for whitespace-delimited tokens that look
    like an e-mail address, URL or DOI (they contain ``@`` or ``/``, or start
    with ``http:``, ``https:`` or ``www.``). Applying them there rewrites
    identifiers, e.g. ``10.1016/j.cell`` -> ``10.1016/j. cell``, which breaks
    DOI and e-mail extraction on the cleaned text.

    Args:
        text: OCR text to repair.

    Returns:
        The repaired text.
    """
    def is_identifier_like(token: str) -> bool:
        # Heuristic for tokens whose punctuation and digits must stay intact.
        if '@' in token or '/' in token:
            return True
        return token.lower().startswith(('http:', 'https:', 'www.'))

    def sub_outside_identifiers(pattern: str, repl: str, value: str) -> str:
        # Apply an intra-token rule to every token except identifier-like ones.
        def fix_token(match):
            token = match.group(0)
            if is_identifier_like(token):
                return token
            return re.sub(pattern, repl, token)
        return re.sub(r'\S+', fix_token, value)

    # (pattern, replacement, intra_token): intra_token rules never span
    # whitespace, so they can safely be applied token by token.
    replacements = [
        # Common character confusions
        (r'\bl\b', 'I', False),                  # lone lowercase L -> I
        (r'(?<=[a-z])0(?=[a-z])', 'o', True),    # 0 -> o between letters
        (r'(?<=[a-z])1(?=[a-z])', 'l', True),    # 1 -> l between letters
        (r'\bll\b', 'II', False),                # ll -> II (Roman numeral)
        # Re-join words hyphenated at a line break
        (r'(\w)-\n(\w)', r'\1\2', False),
        # Remove stray single symbols on their own lines
        (r'\n[^\w\n]\n', '\n', False),
        # Collapse runs of periods to an ellipsis
        (r'\.{2,}', '...', False),
        # Spacing around punctuation
        (r'\s+([.,;:!?])', r'\1', False),
        (r'([.,;:!?])(?=[A-Za-z])', r'\1 ', True),
    ]

    for pattern, repl, intra_token in replacements:
        if intra_token:
            text = sub_outside_identifiers(pattern, repl, text)
        else:
            text = re.sub(pattern, repl, text)

    return text
248
+
249
+
250
def normalize_unicode(text: str) -> str:
    """Apply NFKC normalisation, then map typographic punctuation to ASCII.

    After normalisation, curly quotes, en/em dashes, the ellipsis character,
    the fi/fl ligatures and the non-breaking space are replaced by plain
    ASCII equivalents.
    """
    ascii_equivalents = str.maketrans({
        '\u2018': "'", '\u2019': "'",    # single smart quotes
        '\u201c': '"', '\u201d': '"',    # double smart quotes
        '\u2013': '-', '\u2014': '-',    # en / em dash
        '\u2026': '...',                 # ellipsis
        '\ufb01': 'fi', '\ufb02': 'fl',  # ligatures
        '\u00a0': ' ',                   # non-breaking space
    })
    return unicodedata.normalize('NFKC', text).translate(ascii_equivalents)
269
+
270
+
271
def process_ocr_text(text: str, fix_artifacts: bool = True, normalize: bool = True) -> str:
    """Run the text clean-up pipeline on raw OCR output.

    Order: optional Unicode normalisation, whitespace clean-up (always), then
    optional OCR-artifact repair.
    """
    result = normalize_unicode(text) if normalize else text
    result = clean_text(result)
    return fix_ocr_artifacts(result) if fix_artifacts else result
279
+
280
+
281
+ # ============================================================================
282
+ # Field Extraction
283
+ # ============================================================================
284
+
285
+ def _first_match(pattern: str, text: str, flags: int = 0) -> Optional[str]:
286
+ """Return first regex capture group match, or None."""
287
+ m = re.search(pattern, text, flags)
288
+ return m.group(1).strip() if m else None
289
+
290
+
291
+ def _all_matches(pattern: str, text: str, flags: int = 0) -> List[str]:
292
+ """Return all regex capture group matches."""
293
+ return [m.strip() for m in re.findall(pattern, text, flags) if m.strip()]
294
+
295
+
296
@dataclass
class ExtractedFields:
    """Bibliographic fields pulled from OCR text; every field defaults to ``None``."""
    # Identifiers and publication details
    doi: Optional[str] = None
    issn: Optional[str] = None
    volume: Optional[str] = None
    issue: Optional[str] = None
    year: Optional[str] = None
    pages: Optional[str] = None
    # Editorial dates
    received: Optional[str] = None
    accepted: Optional[str] = None
    published: Optional[str] = None
    # Front-matter content
    title: Optional[str] = None
    authors: Optional[List[str]] = None
    affiliations: Optional[List[str]] = None
    abstract: Optional[str] = None
    keywords: Optional[List[str]] = None
    email: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return the fields as a plain dict, leaving out those that are ``None``."""
        populated: Dict[str, Any] = {}
        for name, value in asdict(self).items():
            if value is not None:
                populated[name] = value
        return populated
317
+
318
+
319
def extract_doi(text: str) -> Optional[str]:
    """Extract a DOI, trying progressively looser patterns.

    Patterns, in order: a ``doi.org/`` URL, a ``DOI:`` label, then a bare
    ``10.NNNN/...`` token. The label separator accepts an ASCII colon or the
    fullwidth colon U+FF1A. The earlier character class held the mis-decoded
    UTF-8 bytes of U+FF1A, so it matched stray 'ï' / '¼' characters and never
    the fullwidth colon itself.

    Args:
        text: Text to search (matching is case-insensitive).

    Returns:
        The DOI with trailing punctuation removed, or ``None`` if not found.
    """
    patterns = (
        r'(?:https?://)?(?:dx\.)?doi\.org/\s*(10\.[^\s]+)',
        r'DOI\s*[:\uff1a]\s*(10\.[^\s]+)',
        r'\b(10\.\d{4,}/[^\s]+)',
    )
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if not match:
            continue
        doi = match.group(1).strip()
        if doi:
            # Sentence punctuation or a closing bracket often trails the DOI.
            return re.sub(r'[.,;:)\]]+$', '', doi)
    return None
333
+
334
+
335
def extract_identifiers(text: str) -> Dict[str, Optional[str]]:
    """Extract ISSN, ISBN, PMID and arXiv identifiers.

    Each label may be followed by an optional ASCII colon or fullwidth colon
    (U+FF1A). The earlier character class contained the mis-decoded UTF-8
    bytes of U+FF1A instead of the character itself.

    Args:
        text: Text to search (matching is case-insensitive).

    Returns:
        Dict with keys ``'issn'``, ``'isbn'``, ``'pmid'`` and ``'arxiv'``;
        a value is ``None`` when that identifier is not found.
    """
    sep = r'\s*[:\uff1a]?\s*'  # optional colon (ASCII or fullwidth) with spacing
    return {
        'issn': _first_match(rf'ISSN{sep}([0-9]{{4}}-[0-9]{{3}}[0-9Xx])', text, re.IGNORECASE),
        'isbn': _first_match(rf'ISBN{sep}([\d-]{{10,17}})', text, re.IGNORECASE),
        'pmid': _first_match(rf'PMID{sep}(\d+)', text, re.IGNORECASE),
        'arxiv': _first_match(rf'arXiv{sep}(\d+\.\d+)', text, re.IGNORECASE),
    }
343
+
344
+
345
def extract_publication_info(text: str) -> Dict[str, Optional[str]]:
    """Extract volume, issue, page range and year.

    Two fixes over the earlier patterns:

    * The optional separator is an ASCII colon or the fullwidth colon U+FF1A.
      The earlier class held the mis-decoded UTF-8 bytes of U+FF1A instead.
    * Labels must not be preceded by a letter. Because matching is
      case-insensitive, an unanchored ``No`` or ``p`` also matched inside
      ordinary words ("piano 12" -> issue 12, "step 3-4" -> pages 3-4).
      A digit before the label is still allowed, so compact forms such as
      ``Vol.12No.3`` keep working.

    Args:
        text: Text to search.

    Returns:
        Dict with keys ``'volume'``, ``'issue'``, ``'pages'`` and ``'year'``;
        a value is ``None`` when that item is not found. ``'year'`` is the
        first standalone 4-digit number starting with 19 or 20.
    """
    label_start = r'(?<![A-Za-z])'  # label may not continue a preceding word
    sep = r'\s*[:\uff1a]?\s*'       # optional colon (ASCII or fullwidth)
    return {
        'volume': _first_match(
            rf'{label_start}Vol(?:ume)?\.?{sep}(\d{{1,4}})', text, re.IGNORECASE),
        'issue': _first_match(
            rf'{label_start}(?:Issue|No\.?|Number){sep}(\d{{1,4}})', text, re.IGNORECASE),
        'pages': _first_match(
            rf'{label_start}(?:pp?\.?|pages?){sep}(\d+\s*[-\u2013]\s*\d+)', text, re.IGNORECASE),
        'year': _first_match(r'\b((?:19|20)\d{2})\b', text),
    }
353
+
354
+
355
def extract_dates(text: str) -> Dict[str, Optional[str]]:
    """Extract the received / accepted / published dates.

    Each label may be followed by an optional ASCII colon or fullwidth colon
    (U+FF1A), then a date in one of three shapes: ``Month D, YYYY``,
    ``D/M/YY(YY)`` (slash or hyphen), or ``YYYY-M-D`` (slash or hyphen). The
    earlier separator class contained the mis-decoded UTF-8 bytes of U+FF1A
    instead of the character itself.

    Args:
        text: Text to search (matching is case-insensitive).

    Returns:
        Dict with keys ``'received'``, ``'accepted'`` and ``'published'``;
        a value is ``None`` when that date is not found.
    """
    date_pattern = (
        r'[:\uff1a]?\s*('
        r'[A-Za-z]+\.?\s+\d{1,2},?\s+\d{4}'
        r'|\d{1,2}[-/]\d{1,2}[-/]\d{2,4}'
        r'|\d{4}[-/]\d{1,2}[-/]\d{1,2}'
        r')'
    )
    return {
        'received': _first_match(rf'Received{date_pattern}', text, re.IGNORECASE),
        'accepted': _first_match(rf'Accepted{date_pattern}', text, re.IGNORECASE),
        'published': _first_match(rf'Published{date_pattern}', text, re.IGNORECASE),
    }
363
+
364
+
365
def extract_abstract(text: str) -> Optional[str]:
    """Extract the abstract body.

    The first pattern reads from the ``Abstract`` label up to a line that
    starts a keywords section, an introduction, or a section numbered ``1``.
    The fallback reads up to the next blank line. A result is accepted only
    when it is longer than 50 characters.

    The optional separator after the label is an ASCII colon or the fullwidth
    colon U+FF1A. The earlier character class contained the mis-decoded UTF-8
    bytes of U+FF1A, so a fullwidth colon ended up inside the abstract text.

    Args:
        text: Text to search (matching is case-insensitive).

    Returns:
        The cleaned abstract, or ``None`` if no acceptable match is found.
    """
    patterns = [
        r'Abstract\s*[:\uff1a]?\s*(.*?)(?=\n\s*(?:Keywords?|Key\s*words|Introduction|1\.|1\s))',
        r'Abstract\s*[:\uff1a]?\s*(.*?)(?=\n\n)',
    ]
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
        if not match:
            continue
        abstract = match.group(1).strip()
        if len(abstract) > 50:  # sanity check: reject implausibly short hits
            return clean_text(abstract)
    return None
378
+
379
+
380
def extract_keywords(text: str) -> Optional[List[str]]:
    """Extract the keyword list that follows a ``Keywords`` label.

    The keyword text runs from the label to the next blank line, the next
    ``Label:`` line, or the end of the text. It is split on semicolons,
    commas, bullet characters or runs of two or more whitespace characters.

    Two fixes over the earlier version:

    * The optional separator is an ASCII colon or the fullwidth colon U+FF1A.
      The earlier class held the mis-decoded UTF-8 bytes of U+FF1A, so a
      fullwidth colon ended up glued to the first keyword.
    * Parts that become empty once surrounding ``.`` / ``-`` are stripped
      (for example ``...``) are dropped instead of being returned as ``''``.

    Args:
        text: Text to search (matching is case-insensitive).

    Returns:
        List of keywords, or ``None`` when no label or no keyword is found.
    """
    pattern = r'(?:Keywords?|Key\s*words)\s*[:\uff1a]?\s*(.*?)(?=\n\n|\n\s*[A-Z][a-z]+:|\Z)'
    match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
    if not match:
        return None

    kw_text = match.group(1).strip()
    # Split on semicolon, comma, bullet points, or wide gaps.
    parts = (p.strip() for p in re.split(r'[;,\u2022\u00b7]|\s{2,}', kw_text))
    stripped = (p.strip('.-') for p in parts if len(p) > 2)
    keywords = [kw for kw in stripped if kw]
    return keywords if keywords else None
391
+
392
+
393
def extract_title(lines: List[str]) -> Optional[str]:
    """Pick the most title-like line among the first 15 lines.

    A line qualifies when it contains none of the exclusion markers
    (case-insensitive substring test), is 25-200 characters long, has at
    least four words and is at least 60% letters. Qualifying lines are
    scored: 100 minus 5 per line index, +10 for an uppercase first
    character, +10 for a length strictly between 50 and 150. The highest
    score wins; ties fall back to string comparison.

    Returns:
        The winning line, or ``None`` when no line qualifies.
    """
    exclude_markers = {
        'journal', 'issn', 'isbn', 'volume', 'issue', 'article', 'research article',
        'department', 'university', 'corresponding', 'received', 'accepted',
        'abstract', 'keywords', 'http', 'doi', 'email', '@', 'copyright'
    }

    def qualifies(line: str) -> bool:
        lowered = line.lower()
        if any(marker in lowered for marker in exclude_markers):
            return False
        if len(line) < 25 or len(line) > 200:
            return False
        if len(line.split()) < 4:
            return False
        letters = sum(ch.isalpha() for ch in line)
        return letters / max(1, len(line)) >= 0.6

    scored = []
    for position, line in enumerate(lines[:15]):
        if not qualifies(line):
            continue
        points = 100 - position * 5          # earlier lines are preferred
        points += 10 if line[0].isupper() else 0
        points += 10 if 50 < len(line) < 150 else 0
        scored.append((points, line))

    if not scored:
        return None
    return max(scored)[1]
434
+
435
+
436
def extract_authors(text: str, lines: List[str], title: Optional[str]) -> Optional[List[str]]:
    """Extract author names from the lines that follow the title.

    The three lines after the title are inspected. A line is taken as the
    author line when it contains a comma, an ampersand or the word "and",
    and has at least two capitalised words. It is then split on those
    separators, and pieces of two characters or fewer are dropped.

    The ampersand test used to be ``\\b(?:and|&)\\b``. ``&`` is not a word
    character, so that never matched an ampersand surrounded by spaces and
    "A Smith & B Jones" was missed. It is now checked without boundaries.

    Args:
        text: Full document text (unused; kept for interface compatibility).
        lines: Non-empty document lines.
        title: The detected title, expected to be one of ``lines``.

    Returns:
        List of author names, or ``None`` when no author line is found.
    """
    if not title or title not in lines:
        return None

    first = lines.index(title) + 1
    for candidate in lines[first:first + 3]:
        has_separator = (
            ',' in candidate
            or '&' in candidate
            or re.search(r'\band\b', candidate, re.IGNORECASE) is not None
        )
        if not has_separator:
            continue

        # Require a name-like shape: at least two capitalised words.
        if len(re.findall(r'\b[A-Z][a-z]+\b', candidate)) < 2:
            continue

        pieces = re.split(r',\s*(?:and\s+)?|\s+and\s+|\s*&\s*', candidate)
        authors = [p.strip() for p in pieces if len(p.strip()) > 2]
        if authors:
            return authors
    return None
454
+
455
+
456
def extract_email(text: str) -> Optional[str]:
    """Return the first e-mail-shaped token in ``text``, or ``None``."""
    found = re.search(r'[\w.-]+@[\w.-]+\.\w+', text)
    if found is None:
        return None
    return found.group(0)
461
+
462
+
463
def extract_fields(text: str) -> ExtractedFields:
    """Run every field extractor on ``text`` and collect the results.

    ``affiliations`` is not populated here and keeps its default. Of the
    identifiers found by ``extract_identifiers`` only the ISSN is stored.
    """
    # Line-based extractors work on the non-empty, stripped lines.
    non_empty_lines = [stripped for stripped in (raw.strip() for raw in text.split('\n')) if stripped]

    doi = extract_doi(text)
    ids = extract_identifiers(text)
    publication = extract_publication_info(text)
    editorial_dates = extract_dates(text)
    title = extract_title(non_empty_lines)
    authors = extract_authors(text, non_empty_lines, title)
    abstract = extract_abstract(text)
    keywords = extract_keywords(text)
    email = extract_email(text)

    result = ExtractedFields()
    result.doi = doi
    result.issn = ids.get('issn')
    result.volume = publication.get('volume')
    result.issue = publication.get('issue')
    result.pages = publication.get('pages')
    result.year = publication.get('year')
    result.received = editorial_dates.get('received')
    result.accepted = editorial_dates.get('accepted')
    result.published = editorial_dates.get('published')
    result.title = title
    result.authors = authors
    result.abstract = abstract
    result.keywords = keywords
    result.email = email
    return result
494
+
495
+
496
+ # ============================================================================
497
+ # Main Processing Function
498
+ # ============================================================================
499
+
500
def process_image(
    image_path: str,
    output_text_path: Optional[str] = None,
    output_json_path: Optional[str] = None,
    scale_factor: float = 1.5,
    psms: List[int] = None,
    verbose: bool = True
) -> Tuple[str, ExtractedFields, OcrCandidate]:
    """
    OCR a document image, clean the text and extract structured fields.

    Args:
        image_path: Path to the input image file
        output_text_path: Optional path; the cleaned text is written there
        output_json_path: Optional path; the extracted fields are written
            there as JSON
        scale_factor: Scale factor for the upscaled preprocessing variant
        psms: Tesseract page segmentation modes to try
            (defaults to [3, 4, 6, 11])
        verbose: Whether to print progress information

    Returns:
        Tuple of (cleaned_text, extracted_fields, best_ocr_candidate)

    Raises:
        ValueError: If the image cannot be read by ``cv2.imread``
    """
    def report(message: str) -> None:
        # Progress output is shown only in verbose mode.
        if verbose:
            print(message)

    if psms is None:
        psms = [3, 4, 6, 11]

    # cv2.imread returns None instead of raising when loading fails.
    loaded_bgr = cv2.imread(image_path)
    if loaded_bgr is None:
        raise ValueError(f'Failed to read image: {image_path}')
    rgb = cv2.cvtColor(loaded_bgr, cv2.COLOR_BGR2RGB)

    report(f'Processing image: {image_path}')
    report(f'Image size: {rgb.shape[1]}x{rgb.shape[0]}')

    best = run_agent(rgb, psms=psms, scale_factor=scale_factor, verbose=verbose)
    report(f'\nSelected: {best.variant} | PSM={best.psm} | conf={best.avg_conf:.1f} | score={best.score:.1f}')

    cleaned_text = process_ocr_text(best.text)
    fields = extract_fields(cleaned_text)

    if output_text_path:
        with open(output_text_path, 'w', encoding='utf-8') as text_file:
            text_file.write(cleaned_text)
        report(f'Saved text: {output_text_path}')

    if output_json_path:
        with open(output_json_path, 'w', encoding='utf-8') as json_file:
            json.dump(fields.to_dict(), json_file, indent=2, ensure_ascii=False)
        report(f'Saved JSON: {output_json_path}')

    return cleaned_text, fields, best
562
+
563
+
564
+ # ============================================================================
565
+ # CLI Entry Point
566
+ # ============================================================================
567
+
568
def main():
    """Command-line interface for the OCR extractor.

    Parses the arguments, runs ``process_image`` and prints the extracted
    fields as JSON on stdout. Error messages go to stderr so they do not mix
    with the regular output.

    Returns:
        Process exit status: ``0`` on success, ``1`` when the image is
        missing or processing fails.
    """
    import sys  # local import: only needed to route errors to stderr

    parser = argparse.ArgumentParser(
        description='Lightweight Agentic OCR Document Extraction',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
Examples:
  python agentic_ocr_extractor.py document.jpg
  python agentic_ocr_extractor.py document.png -o output.txt -j fields.json
  python agentic_ocr_extractor.py scan.jpg --scale 2.0 --psm 3 6 11
        '''
    )

    parser.add_argument('image', help='Path to the input image file')
    parser.add_argument('-o', '--output-text', help='Path to save extracted text')
    parser.add_argument('-j', '--output-json', help='Path to save extracted fields as JSON')
    parser.add_argument('--scale', type=float, default=1.5, help='Scale factor for upscaling (default: 1.5)')
    parser.add_argument('--psm', type=int, nargs='+', default=[3, 4, 6, 11],
                        help='Tesseract PSM modes to try (default: 3 4 6 11)')
    parser.add_argument('-q', '--quiet', action='store_true', help='Suppress progress output')

    args = parser.parse_args()

    if not os.path.exists(args.image):
        print(f'Error: Image file not found: {args.image}', file=sys.stderr)
        return 1

    # Top-level boundary: report any failure and turn it into an exit status.
    try:
        cleaned_text, fields, best = process_image(
            args.image,
            output_text_path=args.output_text,
            output_json_path=args.output_json,
            scale_factor=args.scale,
            psms=args.psm,
            verbose=not args.quiet
        )

        # Print extracted fields
        print('\nExtracted Fields:')
        print(json.dumps(fields.to_dict(), indent=2, ensure_ascii=False))

        return 0

    except Exception as e:
        print(f'Error: {e}', file=sys.stderr)
        return 1
614
+
615
+
616
if __name__ == '__main__':
    # The bare exit() builtin is injected by the site module and is missing
    # under `python -S` or in frozen builds; SystemExit is always available.
    raise SystemExit(main())