| from __future__ import annotations |
|
|
| import argparse |
| import sys |
| from pathlib import Path |
|
|
|
|
def _join_nonempty_lines(lines) -> str:
    """Return the stripped, non-empty entries of *lines* joined by newlines."""
    stripped = (str(line).strip() for line in lines)
    return "\n".join(text for text in stripped if text)


def main() -> None:
    """Extract Arabic text from every PNG in a directory and write it to a file.

    Command-line arguments:
        --image-dir: directory whose ``*.png`` files are processed in sorted
            path order.
        --out: destination text file, written as UTF-8; missing parent
            directories are created.

    Progress is reported on stdout as ``ARABIC_READER_PROGRESS <done> <total>``
    lines, each flushed immediately. Pages that yield no text are skipped;
    the remaining page texts are separated by a blank line in the output.
    """
    # Switch the standard streams to UTF-8 (replacing unencodable characters)
    # whenever the stream object supports reconfigure().
    for stream in (sys.stdout, sys.stderr):
        if hasattr(stream, "reconfigure"):
            stream.reconfigure(encoding="utf-8", errors="replace")

    parser = argparse.ArgumentParser(
        description="Extract Arabic text from page images with EasyOCR."
    )
    parser.add_argument("--image-dir", required=True, type=Path)
    parser.add_argument("--out", required=True, type=Path)
    args = parser.parse_args()

    image_paths = sorted(args.image_dir.glob("*.png"))
    # The reported total is never below 1, even when no images were found.
    # NOTE(review): presumably so the progress consumer never sees a zero
    # total -- confirm against the caller.
    total = max(len(image_paths), 1)
    print(f"ARABIC_READER_PROGRESS 0 {total}", flush=True)

    pieces: list[str] = []
    if image_paths:
        # Imported lazily so the initial progress line is emitted before the
        # OCR library is loaded, and so that neither the import nor the
        # reader construction happens when there is nothing to process.
        import easyocr

        reader = easyocr.Reader(["ar"], gpu=False, verbose=False)
        for index, image_path in enumerate(image_paths, start=1):
            lines = reader.readtext(str(image_path), detail=0, paragraph=True)
            page_text = _join_nonempty_lines(lines)
            if page_text:
                pieces.append(page_text)
            print(f"ARABIC_READER_PROGRESS {index} {total}", flush=True)

    args.out.parent.mkdir(parents=True, exist_ok=True)
    args.out.write_text("\n\n".join(pieces), encoding="utf-8")
|
|
|
|
# Script entry point: run the extraction only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|