| from __future__ import annotations |
|
|
| import argparse |
| import os |
| import sys |
| from pathlib import Path |
|
|
# Default FLAGS_use_mkldnn to "0" without overriding a value that is already
# present in the environment. This runs when the module is loaded, which is
# before main() performs its function-local import of paddleocr.
# NOTE(review): presumably this disables Paddle's MKL-DNN/oneDNN CPU kernels;
# confirm against the Paddle flags documentation.
os.environ.setdefault("FLAGS_use_mkldnn", "0")
|
|
|
|
def extract_text(result) -> list[str]:
    """Collect the non-empty text lines found in an OCR result structure.

    A ``dict`` contributes the entries of its ``"rec_texts"`` and ``"texts"``
    values (each only when that value is a ``list``). A ``list`` is walked
    recursively and the lines of its elements are concatenated in order.
    Any other input yields no lines.

    Args:
        result: A dict, a (possibly nested) list of dicts, or anything else.
            NOTE(review): presumably the object returned by
            ``PaddleOCR.predict`` -- confirm the result schema.

    Returns:
        The entries converted with ``str()`` and stripped of surrounding
        whitespace, omitting any that are empty after stripping.
    """
    if isinstance(result, dict):
        collected: list[str] = []
        for field in ("rec_texts", "texts"):
            entries = result.get(field)
            if not isinstance(entries, list):
                # Missing key or unexpected value type: nothing to collect.
                continue
            for entry in entries:
                text = str(entry).strip()
                if text:
                    collected.append(text)
        return collected

    if isinstance(result, list):
        # Flatten the lines of every element, preserving element order.
        return [text for element in result for text in extract_text(element)]

    return []
|
|
|
|
def main() -> None:
    """Run OCR over every ``*.png`` in a directory and write the joined text.

    Command-line arguments:
        --image-dir: directory whose ``*.png`` files are processed in sorted
            path order.
        --out: file that receives the text; missing parent directories are
            created.

    Progress is reported on stdout as ``ARABIC_READER_PROGRESS <done> <total>``
    lines, flushed immediately: one with ``<done>`` equal to 0 before the OCR
    engine is created, then one after each image. The text of each page is
    stripped, pages with no text are dropped, and the remaining pages are
    separated by a blank line in the UTF-8 output file.
    """
    # Switch both standard streams to UTF-8, replacing unencodable characters,
    # on stream objects that offer reconfigure().
    for stream in (sys.stdout, sys.stderr):
        if hasattr(stream, "reconfigure"):
            stream.reconfigure(encoding="utf-8", errors="replace")

    cli = argparse.ArgumentParser(
        description="Extract Arabic text from page images with PaddleOCR."
    )
    cli.add_argument("--image-dir", required=True, type=Path)
    cli.add_argument("--out", required=True, type=Path)
    options = cli.parse_args()

    pages = sorted(options.image_dir.glob("*.png"))
    # The reported total is clamped so it is never 0, even with no images.
    page_count = max(len(pages), 1)
    print(f"ARABIC_READER_PROGRESS 0 {page_count}", flush=True)

    # Imported here rather than at module top, so the initial progress line
    # above is emitted before the import runs.
    # NOTE(review): presumably because importing paddleocr is slow -- confirm.
    from paddleocr import PaddleOCR

    engine = PaddleOCR(
        lang="ar",
        ocr_version="PP-OCRv5",
        use_doc_orientation_classify=False,
        use_doc_unwarping=False,
        use_textline_orientation=False,
    )

    page_texts: list[str] = []
    for done, page in enumerate(pages, start=1):
        recognized = extract_text(engine.predict(str(page)))
        text = "\n".join(recognized).strip()
        if text:
            page_texts.append(text)
        print(f"ARABIC_READER_PROGRESS {done} {page_count}", flush=True)

    destination = options.out
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text("\n\n".join(page_texts), encoding="utf-8")
|
|
|
|
# Run the command-line entry point only when this file is executed as a
# script; importing the module does not start OCR processing.
if __name__ == "__main__":
    main()
|
|