Spaces:

Syncre
/

arabic-audio-reader-worker

Running

App Files Files Community

arabic-audio-reader-worker / scripts /paddleocr_vl_extract.py

Syncre

Deploy Arabic Audio Reader worker

2e1a095 verified 1 day ago

raw

history blame contribute delete

3.49 kB

	from __future__ import annotations

	import argparse
	import html
	import re
	import sys
	from pathlib import Path
	from typing import Any


	# Matches one markup tag: "<", at least one character that is not ">", then ">".
	TAG_RE = re.compile(r"<[^>]+>")


	def text_from_html(value: str) -> str:
	    """Turn an HTML-ish fragment into plain text.

	    Every tag matched by ``TAG_RE`` is replaced with a newline, and HTML
	    character references (``&amp;``, ``&lt;`` ...) are decoded afterwards,
	    so escaped angle brackets survive as literal text.

	    Args:
	        value: Text that may contain markup tags and HTML entities.

	    Returns:
	        The text with tags turned into line breaks and entities decoded.
	    """
	    without_tags = TAG_RE.sub("\n", value)
	    return html.unescape(without_tags)


	def extract_text(value: Any) -> list[str]:
	    """Recursively collect the non-empty, stripped text lines found in *value*.

	    Handling by type:

	    * ``None`` yields nothing.
	    * ``str`` is passed through ``text_from_html`` and split into lines.
	    * ``dict`` contributes its string values under ``text``/``markdown``/
	      ``html``/``content``, then whatever is nested under ``res``/``blocks``/
	      ``text_lines``/``children``/``items``/``pages``.
	    * ``list``/``tuple`` contribute every element in order.
	    * Any other object is probed for attributes with those same names. Only
	      when the attributes yield no text is a serialised form tried
	      (``model_dump()``, else ``dict()``, else the ``json`` attribute).

	    Args:
	        value: A string, container, or result object to search for text.

	    Returns:
	        The collected lines in encounter order; empty when nothing is found.
	    """
	    if value is None:
	        return []
	    if isinstance(value, str):
	        # Tags become line breaks, so each markup-delimited run is its own line.
	        return [line.strip() for line in text_from_html(value).splitlines() if line.strip()]

	    lines: list[str] = []
	    if isinstance(value, dict):
	        # Text-bearing keys are only used when they hold a plain string.
	        for key in ("text", "markdown", "html", "content"):
	            item = value.get(key)
	            if isinstance(item, str):
	                lines.extend(extract_text(item))
	        # Container keys may hold any nested structure.
	        for key in ("res", "blocks", "text_lines", "children", "items", "pages"):
	            item = value.get(key)
	            if item is not None:
	                lines.extend(extract_text(item))
	        return lines
	    if isinstance(value, (list, tuple)):
	        for item in value:
	            lines.extend(extract_text(item))
	        return lines

	    for attribute in ("text", "markdown", "html", "content", "res", "blocks", "text_lines", "children", "items", "pages"):
	        # A missing attribute reads as None, which contributes no lines.
	        lines.extend(extract_text(getattr(value, attribute, None)))
	    if lines:
	        # The attributes already produced text. A serialised copy of the same
	        # object would repeat it, so the fallbacks below are skipped.
	        return lines

	    if hasattr(value, "model_dump"):
	        lines.extend(extract_text(value.model_dump()))
	    elif hasattr(value, "dict"):
	        lines.extend(extract_text(value.dict()))
	    elif hasattr(value, "json"):
	        # ``json`` is read as an attribute, not called. This last-resort probe
	        # is deliberately best-effort: any failure just means no text here.
	        try:
	            lines.extend(extract_text(value.json))
	        except Exception:
	            pass
	    return lines


	def _natural_sort_key(path: Path) -> tuple[list[Any], str]:
	    """Return a sort key that orders digit runs in the file name numerically."""
	    tokens = re.split(r"(\d+)", path.name)
	    # Splitting on a capturing group alternates text, digits, text, ... so the
	    # odd positions are always digit runs and same-position items stay comparable.
	    key = [int(token) if position % 2 else token for position, token in enumerate(tokens)]
	    # The raw name breaks ties such as "1.png" versus "01.png".
	    return key, path.name


	def main() -> None:
	    """Run PaddleOCR-VL over every PNG in a directory and write the text to a file.

	    Command-line arguments:
	        --image-dir: Directory whose ``*.png`` files are processed (required).
	        --out: Output text file; parent directories are created (required).
	        --pipeline-version: Passed to ``PaddleOCRVL`` (default ``v1.6``).
	        --vl-rec-backend / --vl-rec-server-url: Optional, forwarded to
	            ``PaddleOCRVL`` only when given.

	    Progress is reported on stdout as ``ARABIC_READER_PROGRESS <done> <total>``
	    lines: one before the pipeline is loaded and one after each image.
	    Pages are processed in natural file-name order (``page-2`` before
	    ``page-10``). Pages that yield text are joined with a blank line between
	    them and written as UTF-8.
	    """
	    # Replace unencodable characters rather than failing when the console
	    # encoding cannot represent the text being printed.
	    for stream in (sys.stdout, sys.stderr):
	        if hasattr(stream, "reconfigure"):
	            stream.reconfigure(encoding="utf-8", errors="replace")

	    parser = argparse.ArgumentParser(description="Extract text from page images with PaddleOCR-VL.")
	    parser.add_argument("--image-dir", required=True, type=Path)
	    parser.add_argument("--out", required=True, type=Path)
	    parser.add_argument("--pipeline-version", default="v1.6")
	    parser.add_argument("--vl-rec-backend", help="Optional PaddleOCR-VL backend, for example vllm-server.")
	    parser.add_argument("--vl-rec-server-url", help="Optional VLM server URL for --vl-rec-backend.")
	    args = parser.parse_args()

	    # Collected once and reused for both the progress total and the loop.
	    image_paths = sorted(args.image_dir.glob("*.png"), key=_natural_sort_key)
	    # The reported total is never 0, even for an empty directory.
	    total = max(len(image_paths), 1)
	    print(f"ARABIC_READER_PROGRESS 0 {total}", flush=True)

	    # Imported only after the first progress line has been printed.
	    # NOTE(review): presumably because loading paddleocr is slow - confirm.
	    from paddleocr import PaddleOCRVL

	    kwargs: dict[str, str] = {"pipeline_version": args.pipeline_version}
	    if args.vl_rec_backend:
	        kwargs["vl_rec_backend"] = args.vl_rec_backend
	    if args.vl_rec_server_url:
	        kwargs["vl_rec_server_url"] = args.vl_rec_server_url
	    pipeline = PaddleOCRVL(**kwargs)

	    pieces: list[str] = []
	    for index, image_path in enumerate(image_paths, start=1):
	        output = pipeline.predict(str(image_path))
	        page_lines = [line.strip() for line in extract_text(output) if line.strip()]
	        if page_lines:
	            pieces.append("\n".join(page_lines))
	        print(f"ARABIC_READER_PROGRESS {index} {total}", flush=True)

	    args.out.parent.mkdir(parents=True, exist_ok=True)
	    args.out.write_text("\n\n".join(pieces), encoding="utf-8")


	# Run the command-line entry point only when this file is executed as a script.
	if __name__ == "__main__":
	    main()