"""Extract Arabic text from a directory of page images with PaddleOCR.

Every ``*.png`` file in ``--image-dir`` is run through PaddleOCR in natural
page order and the recognised text is written to ``--out`` as UTF-8, with one
blank line between pages. Progress is reported on stdout as
``ARABIC_READER_PROGRESS <done> <total>`` lines.
"""

from __future__ import annotations

import argparse
import os
import re
import sys
from pathlib import Path

# Must be set before PaddleOCR is imported (the import is deferred to main()).
# setdefault keeps any value already exported by the caller.
# NOTE(review): presumably disables Paddle's MKL-DNN backend - confirm.
os.environ.setdefault("FLAGS_use_mkldnn", "0")

# Capturing group so re.split keeps the digit runs in its result.
_DIGIT_RUNS = re.compile(r"(\d+)")


def natural_sort_key(path) -> tuple[list[int | str], str]:
    """Return a sort key that orders file names by embedded numbers.

    Plain lexicographic ordering puts ``page-10.png`` before ``page-2.png``;
    this key compares digit runs numerically so pages stay in reading order.

    Args:
        path: A ``str`` or path-like object; only the final component is used.

    Returns:
        A tuple of the name split into alternating text / integer parts,
        followed by the raw name as a deterministic tie-breaker (for names
        such as ``a1.png`` and ``a01.png`` whose parts compare equal).
    """
    name = Path(path).name
    # With one capturing group, re.split alternates text (even index) and
    # digit runs (odd index), so parts at the same position share a type.
    parts: list[int | str] = [
        int(part) if position % 2 else part
        for position, part in enumerate(_DIGIT_RUNS.split(name))
    ]
    return parts, name


def extract_text(result) -> list[str]:
    """Collect non-empty recognised text lines from an OCR result.

    Args:
        result: A dict holding a list under ``"rec_texts"`` and/or
            ``"texts"``, or an arbitrarily nested list of such dicts.
            Any other value contributes nothing.

    Returns:
        The stripped, non-empty strings in encounter order.
    """
    lines: list[str] = []
    if isinstance(result, dict):
        for key in ("rec_texts", "texts"):
            value = result.get(key)
            if isinstance(value, list):
                # Strip once per item instead of once to test and once to keep.
                stripped = (str(item).strip() for item in value)
                lines.extend(text for text in stripped if text)
        return lines
    if isinstance(result, list):
        for item in result:
            lines.extend(extract_text(item))
    return lines


def main() -> None:
    """Run OCR over every PNG in ``--image-dir`` and write the text to ``--out``."""
    # Arabic output must not crash on consoles with a non-UTF-8 encoding.
    for stream in (sys.stdout, sys.stderr):
        if hasattr(stream, "reconfigure"):
            stream.reconfigure(encoding="utf-8", errors="replace")

    parser = argparse.ArgumentParser(
        description="Extract Arabic text from page images with PaddleOCR."
    )
    parser.add_argument("--image-dir", required=True, type=Path)
    parser.add_argument("--out", required=True, type=Path)
    args = parser.parse_args()

    # Natural ordering keeps page-2 ahead of page-10 when numbers are unpadded.
    image_paths = sorted(args.image_dir.glob("*.png"), key=natural_sort_key)
    # The reported total is never 0, even when no images were found.
    total = max(len(image_paths), 1)
    print(f"ARABIC_READER_PROGRESS 0 {total}", flush=True)

    # Deferred import: the first progress line is emitted before PaddleOCR
    # is loaded, and the environment default above is already in place.
    from paddleocr import PaddleOCR

    ocr = PaddleOCR(
        lang="ar",
        ocr_version="PP-OCRv5",
        use_doc_orientation_classify=False,
        use_doc_unwarping=False,
        use_textline_orientation=False,
    )

    pieces: list[str] = []
    for index, image_path in enumerate(image_paths, start=1):
        result = ocr.predict(str(image_path))
        page_text = "\n".join(extract_text(result)).strip()
        if page_text:
            pieces.append(page_text)
        # Flushed per page so a parent process reading the pipe sees progress
        # as it happens.
        print(f"ARABIC_READER_PROGRESS {index} {total}", flush=True)

    args.out.parent.mkdir(parents=True, exist_ok=True)
    args.out.write_text("\n\n".join(pieces), encoding="utf-8")


if __name__ == "__main__":
    main()