Spaces:

Syncre
/

arabic-audio-reader-worker

Running

Deploy Arabic Audio Reader worker

2e1a095 verified 1 day ago

1.36 kB

	from __future__ import annotations

	import argparse
	import re
	import sys
	from pathlib import Path


	def _use_utf8_streams() -> None:
	    """Switch stdout and stderr to UTF-8, replacing unencodable characters."""
	    for stream in (sys.stdout, sys.stderr):
	        # Replaced or missing streams may not offer ``reconfigure``; skip them.
	        if hasattr(stream, "reconfigure"):
	            stream.reconfigure(encoding="utf-8", errors="replace")


	def _natural_sort_key(image_path: Path) -> tuple:
	    """Return a sort key that compares digit runs in the file name numerically."""
	    name = image_path.name
	    # re.split with a capturing group alternates text, digits, text, ... so odd
	    # positions are always digit runs and keys never compare int against str.
	    chunks = [
	        int(chunk) if position % 2 else chunk
	        for position, chunk in enumerate(re.split(r"(\d+)", name))
	    ]
	    # The raw name breaks ties such as "1.png" vs "01.png" deterministically.
	    return chunks, name


	def main() -> None:
	    """Run EasyOCR over the PNG pages in a directory and write the joined text.

	    Command-line arguments (both required):
	        --image-dir: directory whose ``*.png`` files are the pages to read.
	        --out: file that receives the extracted text as UTF-8; missing parent
	            directories are created.

	    Progress is printed to stdout as ``ARABIC_READER_PROGRESS <done> <total>``
	    lines, flushed immediately: one ``0 <total>`` line before OCR starts and
	    one line after each page. ``<total>`` is never reported as less than 1.

	    Pages are processed in natural file-name order (``page2`` before
	    ``page10``). Pages that yield no text are skipped; the remaining page
	    texts are separated by a blank line in the output file. When no PNG files
	    are found, EasyOCR is not loaded and an empty output file is written.
	    """
	    _use_utf8_streams()

	    parser = argparse.ArgumentParser(description="Extract Arabic text from page images with EasyOCR.")
	    parser.add_argument("--image-dir", required=True, type=Path)
	    parser.add_argument("--out", required=True, type=Path)
	    args = parser.parse_args()

	    # Plain lexicographic order would read "page10.png" before "page2.png".
	    image_paths = sorted(args.image_dir.glob("*.png"), key=_natural_sort_key)
	    # Clamped to 1 -- presumably so a consumer computing done/total never
	    # divides by zero; confirm against the progress parser.
	    total = max(len(image_paths), 1)
	    print(f"ARABIC_READER_PROGRESS 0 {total}", flush=True)

	    pieces: list[str] = []
	    if image_paths:
	        # Imported only after the first progress line and only when there is
	        # work to do, so an empty job does not pay for loading the OCR model.
	        import easyocr

	        reader = easyocr.Reader(["ar"], gpu=False, verbose=False)
	        for index, image_path in enumerate(image_paths, start=1):
	            lines = reader.readtext(str(image_path), detail=0, paragraph=True)
	            stripped = (str(line).strip() for line in lines)
	            page_text = "\n".join(text for text in stripped if text)
	            if page_text:
	                pieces.append(page_text)
	            print(f"ARABIC_READER_PROGRESS {index} {total}", flush=True)

	    args.out.parent.mkdir(parents=True, exist_ok=True)
	    args.out.write_text("\n\n".join(pieces), encoding="utf-8")


	# Run the OCR job only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()