arabic-audio-reader-worker / scripts /easyocr_extract.py
Syncre's picture
Deploy Arabic Audio Reader worker
2e1a095 verified
from __future__ import annotations

import argparse
import re
import sys
from pathlib import Path
def _natural_sort_key(path: Path) -> tuple[list[int | str], str]:
    """Return a sort key that orders digit runs in the file name numerically.

    Plain string sorting puts ``page-10.png`` before ``page-2.png``; this key
    compares the embedded numbers as integers so pages keep reading order.
    Zero-padded names sort exactly as they do lexicographically.
    """
    tokens = re.split(r"([0-9]+)", path.name)
    # With a capturing group, re.split alternates text / digits / text ..., so
    # odd positions are always digit runs and keys stay mutually comparable.
    numeric_aware = [int(tok) if pos % 2 else tok for pos, tok in enumerate(tokens)]
    # The raw name breaks ties such as "page-1.png" vs "page-01.png".
    return numeric_aware, path.name


def main() -> None:
    """Run EasyOCR (Arabic) over every PNG in a directory and save the text.

    Command-line arguments:
        --image-dir: directory whose ``*.png`` files are processed
            (non-recursive), in natural page order.
        --out: path of the UTF-8 text file to write; missing parent
            directories are created.

    Progress is reported on stdout as ``ARABIC_READER_PROGRESS <done> <total>``
    lines, flushed immediately. Pages whose OCR result is empty are skipped;
    the remaining pages are separated by a blank line in the output file.
    """
    # Force UTF-8 on the standard streams where supported; unencodable
    # characters are replaced rather than raising.
    for stream in (sys.stdout, sys.stderr):
        if hasattr(stream, "reconfigure"):
            stream.reconfigure(encoding="utf-8", errors="replace")

    parser = argparse.ArgumentParser(description="Extract Arabic text from page images with EasyOCR.")
    parser.add_argument("--image-dir", required=True, type=Path)
    parser.add_argument("--out", required=True, type=Path)
    args = parser.parse_args()

    # Natural ordering keeps "page-2.png" ahead of "page-10.png".
    image_paths = sorted(args.image_dir.glob("*.png"), key=_natural_sort_key)
    # The reported total is never 0, even when no images are found
    # (presumably so a consumer can divide by it -- confirm with the caller).
    total = max(len(image_paths), 1)
    print(f"ARABIC_READER_PROGRESS 0 {total}", flush=True)

    # Imported only after the first progress line has been emitted.
    import easyocr

    reader = easyocr.Reader(["ar"], gpu=False, verbose=False)
    pieces: list[str] = []
    for index, image_path in enumerate(image_paths, start=1):
        lines = reader.readtext(str(image_path), detail=0, paragraph=True)
        # Normalise each OCR entry to stripped text and drop blank ones.
        cleaned = (str(line).strip() for line in lines)
        page_text = "\n".join(text for text in cleaned if text)
        if page_text:
            pieces.append(page_text)
        print(f"ARABIC_READER_PROGRESS {index} {total}", flush=True)

    args.out.parent.mkdir(parents=True, exist_ok=True)
    args.out.write_text("\n\n".join(pieces), encoding="utf-8")
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()