File size: 1,974 Bytes
2e1a095 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 | from __future__ import annotations
import argparse
import os
import sys
from pathlib import Path
# Default FLAGS_use_mkldnn to "0" unless the caller already exported a value
# (setdefault never overwrites an existing setting). This runs at import time,
# before main() lazily imports paddleocr.
# NOTE(review): presumably this turns off Paddle's MKL-DNN/oneDNN CPU backend;
# confirm against the Paddle flags documentation.
os.environ.setdefault("FLAGS_use_mkldnn", "0")
def extract_text(result) -> list[str]:
    """Collect non-empty text lines from an OCR prediction result.

    The result is walked recursively:

    * a ``dict`` contributes the entries stored under its ``"rec_texts"`` and
      ``"texts"`` keys (in that order; both are used when both are present),
      provided the stored value is a list or tuple;
    * a ``list`` or ``tuple`` contributes the lines of each of its elements,
      in order;
    * anything else contributes nothing.

    Args:
        result: The object returned by the OCR call, or any nested part of it.

    Returns:
        The text entries converted with ``str()``, stripped of surrounding
        whitespace, with empty entries and ``None`` placeholders dropped.
    """
    if isinstance(result, dict):
        lines: list[str] = []
        for key in ("rec_texts", "texts"):
            value = result.get(key)
            if not isinstance(value, (list, tuple)):
                continue
            for item in value:
                # A missing recognition must not leak into the output as the
                # literal string "None".
                if item is None:
                    continue
                text = str(item).strip()
                if text:
                    lines.append(text)
        return lines

    if isinstance(result, (list, tuple)):
        return [line for item in result for line in extract_text(item)]

    return []
def _natural_page_key(path):
    """Return a sort key that orders digit runs in the file name numerically."""
    import re  # local import: the module-level import block does not provide re

    # Splitting on a capturing group yields text, digits, text, ... so every
    # odd position is a digit run; keys therefore compare str with str and
    # int with int, never across types.
    parts = re.split(r"([0-9]+)", path.name)
    key = [int(part) if pos % 2 else part for pos, part in enumerate(parts)]
    # The raw name breaks ties such as "1.png" vs "01.png" deterministically.
    return key, path.name


def main() -> None:
    """Run OCR over every ``*.png`` in a directory and write the combined text.

    Command-line arguments:
        --image-dir: directory that is searched (non-recursively) for
            ``*.png`` page images.
        --out: path of the UTF-8 text file to write; missing parent
            directories are created.

    Progress is reported on stdout as ``ARABIC_READER_PROGRESS <done> <total>``
    lines: one with ``<done>`` equal to 0 before the OCR engine is loaded and
    one after each processed image. ``<total>`` is at least 1 even when no
    image was found.

    Pages are processed in natural file-name order (``page2.png`` before
    ``page10.png``) so the output follows the page sequence even when the
    numbers in the names are not zero-padded. The text of each page is joined
    with newlines, pages without text are skipped, and pages are separated by
    a blank line in the output file.
    """
    # Force UTF-8 on the standard streams where the stream object supports it;
    # unencodable characters are replaced instead of raising.
    for stream in (sys.stdout, sys.stderr):
        if hasattr(stream, "reconfigure"):
            stream.reconfigure(encoding="utf-8", errors="replace")

    parser = argparse.ArgumentParser(
        description="Extract Arabic text from page images with PaddleOCR."
    )
    parser.add_argument("--image-dir", required=True, type=Path)
    parser.add_argument("--out", required=True, type=Path)
    args = parser.parse_args()

    image_paths = sorted(args.image_dir.glob("*.png"), key=_natural_page_key)
    total = max(len(image_paths), 1)
    print(f"ARABIC_READER_PROGRESS 0 {total}", flush=True)

    # Imported only after the first progress line has been flushed, so that
    # line is emitted before the OCR package is loaded.
    from paddleocr import PaddleOCR

    ocr = PaddleOCR(
        lang="ar",
        ocr_version="PP-OCRv5",
        use_doc_orientation_classify=False,
        use_doc_unwarping=False,
        use_textline_orientation=False,
    )

    pieces: list[str] = []
    for index, image_path in enumerate(image_paths, start=1):
        result = ocr.predict(str(image_path))
        page_text = "\n".join(extract_text(result)).strip()
        if page_text:
            pieces.append(page_text)
        print(f"ARABIC_READER_PROGRESS {index} {total}", flush=True)

    args.out.parent.mkdir(parents=True, exist_ok=True)
    args.out.write_text("\n\n".join(pieces), encoding="utf-8")
# Run the command-line entry point only when this file is executed as a
# script; importing the module does not start OCR.
if __name__ == "__main__":
    main()
|