| from __future__ import annotations |
|
|
| import argparse |
| import html |
| import re |
| import sys |
| from pathlib import Path |
| from typing import Any |
|
|
|
|
# Matches one tag-like span: "<", one or more characters other than ">", then ">".
TAG_RE = re.compile(r"<[^>]+>")


def text_from_html(value: str) -> str:
    """Turn an HTML fragment into plain text.

    Each span matched by ``TAG_RE`` is replaced with a newline, so text on
    either side of a tag ends up on separate lines. HTML entities are decoded
    afterwards, which means an escaped tag such as ``&lt;b&gt;`` survives as
    the literal text ``<b>``.

    Args:
        value: Text that may contain markup and HTML entities.

    Returns:
        The text with tags replaced by newlines and entities unescaped.
    """
    without_tags = TAG_RE.sub("\n", value)
    return html.unescape(without_tags)
|
|
|
|
def extract_text(value: Any) -> list[str]:
    """Recursively collect the non-empty text lines found inside *value*.

    How each shape is handled:

    * ``None`` yields no lines.
    * ``str`` is passed through ``text_from_html``, split into lines, and
      each line is stripped; blank lines are dropped.
    * ``dict``: string values under ``"text"``, ``"markdown"``, ``"html"``
      and ``"content"`` are extracted first, then any non-``None`` value
      under ``"res"``, ``"blocks"``, ``"text_lines"``, ``"children"``,
      ``"items"`` and ``"pages"`` is walked recursively.
    * ``list`` / ``tuple``: every element is walked in order.
    * Any other object: the same ten names are read as attributes. Only when
      that produces nothing is a serialised form tried instead, in the order
      ``model_dump()``, ``dict()``, then the ``json`` attribute. Using the
      serialised form purely as a fallback keeps an object that exposes the
      same data both ways from contributing every line twice.

    Args:
        value: Arbitrary, possibly nested, result data.

    Returns:
        Stripped, non-empty lines in traversal order.
    """
    if value is None:
        return []
    if isinstance(value, str):
        return [line.strip() for line in text_from_html(value).splitlines() if line.strip()]

    lines: list[str] = []
    if isinstance(value, dict):
        # Text-bearing keys are only used when they hold a string.
        for key in ("text", "markdown", "html", "content"):
            item = value.get(key)
            if isinstance(item, str):
                lines.extend(extract_text(item))
        # Container keys may hold any nested structure.
        for key in ("res", "blocks", "text_lines", "children", "items", "pages"):
            item = value.get(key)
            if item is not None:
                lines.extend(extract_text(item))
        return lines
    if isinstance(value, (list, tuple)):
        for item in value:
            lines.extend(extract_text(item))
        return lines

    for attribute in ("text", "markdown", "html", "content", "res", "blocks", "text_lines", "children", "items", "pages"):
        if hasattr(value, attribute):
            lines.extend(extract_text(getattr(value, attribute)))
    if lines:
        # The attributes already produced text; walking a serialised copy of
        # the same object as well would repeat it.
        return lines

    # Fallback: serialise the object and walk the result. The callable check
    # skips an attribute that merely happens to be named like a dump method.
    for dump_name in ("model_dump", "dict"):
        dump = getattr(value, dump_name, None)
        if callable(dump):
            return extract_text(dump())
    try:
        # `json` is read as a plain attribute and never called.
        # NOTE(review): presumably a property on result objects; confirm.
        return extract_text(getattr(value, "json", None))
    except Exception:
        # Best-effort by design: a failing `json` accessor must not abort
        # extraction for the rest of the document.
        return []
|
|
|
|
def main() -> None:
    """Run PaddleOCR-VL over every ``*.png`` in a directory and save the text.

    Command-line arguments:
        --image-dir: Directory whose ``*.png`` files are processed, in sorted
            path order (required).
        --out: File that receives the extracted text as UTF-8; missing parent
            directories are created (required).
        --pipeline-version: Passed to ``PaddleOCRVL`` as ``pipeline_version``
            (default ``"v1.6"``).
        --vl-rec-backend / --vl-rec-server-url: Optional; forwarded to
            ``PaddleOCRVL`` only when given.

    Progress is reported on stdout as ``ARABIC_READER_PROGRESS <done> <total>``
    lines, flushed immediately: once with ``0`` before the model is loaded and
    once after each image. The text of each page is joined with newlines and
    pages are separated by a blank line; pages with no text are omitted.
    """
    # Switch the standard streams to UTF-8 where the stream supports it, so
    # unencodable characters are replaced rather than raising on output.
    for stream in (sys.stdout, sys.stderr):
        if hasattr(stream, "reconfigure"):
            stream.reconfigure(encoding="utf-8", errors="replace")

    parser = argparse.ArgumentParser(description="Extract text from page images with PaddleOCR-VL.")
    parser.add_argument("--image-dir", required=True, type=Path)
    parser.add_argument("--out", required=True, type=Path)
    parser.add_argument("--pipeline-version", default="v1.6")
    parser.add_argument("--vl-rec-backend", help="Optional PaddleOCR-VL backend, for example vllm-server.")
    parser.add_argument("--vl-rec-server-url", help="Optional VLM server URL for --vl-rec-backend.")
    args = parser.parse_args()

    # List the images exactly once so the total announced up front is the
    # same one used while iterating.
    image_paths = sorted(args.image_dir.glob("*.png"))
    # The reported total is never below 1, even for an empty directory.
    total = max(len(image_paths), 1)
    print(f"ARABIC_READER_PROGRESS 0 {total}", flush=True)

    # Imported here rather than at module top, so the initial progress line
    # is emitted before the import runs.
    from paddleocr import PaddleOCRVL

    kwargs: dict[str, str] = {"pipeline_version": args.pipeline_version}
    if args.vl_rec_backend:
        kwargs["vl_rec_backend"] = args.vl_rec_backend
    if args.vl_rec_server_url:
        kwargs["vl_rec_server_url"] = args.vl_rec_server_url
    pipeline = PaddleOCRVL(**kwargs)

    pieces: list[str] = []
    for index, image_path in enumerate(image_paths, start=1):
        output = pipeline.predict(str(image_path))
        page_lines = [line.strip() for line in extract_text(output) if line.strip()]
        if page_lines:
            pieces.append("\n".join(page_lines))
        print(f"ARABIC_READER_PROGRESS {index} {total}", flush=True)

    args.out.parent.mkdir(parents=True, exist_ok=True)
    args.out.write_text("\n\n".join(pieces), encoding="utf-8")
|
|
|
|
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|