| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import sys |
| from dataclasses import asdict, dataclass |
| from pathlib import Path |
|
|
| import fitz |
|
|
# Directory two levels above this file's resolved location.  It is pushed to
# the front of the import search path (only once) so that the ``app`` import
# that follows can be resolved when this file is executed directly.
ROOT_DIR = Path(__file__).resolve().parent.parent
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))
|
|
| from app import main |
|
|
|
|
@dataclass
class PageScore:
    """Scoring record for a single PDF page.

    Instances are built by ``score_page`` and converted with
    ``dataclasses.asdict`` when a report dictionary is assembled.
    """

    # 1-based page number within the source document.
    page: int
    # Combined ranking value, rounded to two decimals; higher ranks first.
    score: float
    # Length of the cleaned text extracted from the page.
    characters: int
    # Number of ``main.ARABIC_RE`` matches found in the cleaned text.
    arabic_words: int
    # Fraction of rendered samples classified as dark, rounded to four decimals.
    ink_ratio: float
|
|
|
|
def page_ink_ratio(page: fitz.Page, zoom: float = 0.25) -> float:
    """Estimate the fraction of a rendered page that is darker than its background.

    The page is rendered as a greyscale pixmap without alpha at the given
    ``zoom``.  The background level is the sample value found at the
    95th-percentile position of the sorted samples, and every sample more
    than 25 levels below that background counts as ink.

    Args:
        page: Page to rasterise.
        zoom: Scale factor applied to both axes when rendering.

    Returns:
        Dark-sample count divided by the total sample count, or ``0.0`` when
        the rendered pixmap contains no samples.
    """
    samples = bytes(
        page.get_pixmap(
            matrix=fitz.Matrix(zoom, zoom),
            alpha=False,
            colorspace=fitz.csGRAY,
        ).samples
    )
    total = len(samples)
    if total == 0:
        return 0.0

    # One counter per possible byte value; this holds the same information
    # as a fully sorted copy of the samples.
    histogram = [samples.count(level) for level in range(256)]

    # Walk the cumulative counts until we pass the index that the
    # 95th-percentile element would occupy in the sorted samples.
    rank = int((total - 1) * 0.95)
    background = 255
    seen = 0
    for level, frequency in enumerate(histogram):
        seen += frequency
        if seen > rank:
            background = level
            break

    # Samples strictly below the threshold are ink; the threshold never
    # drops below zero, in which case nothing qualifies.
    threshold = max(0, background - 25)
    dark = sum(histogram[:threshold])
    return dark / total
|
|
|
|
def score_page(page: fitz.Page, page_number: int) -> PageScore:
    """Build the ``PageScore`` record for one page.

    The score is the cleaned text length, plus 12 for every
    ``main.ARABIC_RE`` match in that text, plus 800 times the page's ink
    ratio as reported by ``page_ink_ratio``.

    Args:
        page: Page to analyse.
        page_number: Number stored in the resulting record's ``page`` field.

    Returns:
        A ``PageScore`` with the score rounded to two decimals and the ink
        ratio rounded to four.
    """
    raw_text = page.get_text("text", sort=True)
    cleaned = main.clean_arabic_text(raw_text)
    char_count = len(cleaned)
    word_count = len(main.ARABIC_RE.findall(cleaned))
    ink = page_ink_ratio(page)

    total = char_count + (word_count * 12) + (ink * 800)
    return PageScore(
        page=page_number,
        score=round(total, 2),
        characters=char_count,
        arabic_words=word_count,
        ink_ratio=round(ink, 4),
    )
|
|
|
|
def select_pages(pdf_path: Path, count: int = 5, skip_first: int = 0) -> list[PageScore]:
    """Pick the highest-scoring pages of a PDF.

    Every page after the first ``skip_first`` pages is scored with
    ``score_page``.  The ``count`` best scores are kept (fewer if the
    document has fewer eligible pages) and returned in ascending page order.
    Because the ranking sort is stable, pages with equal scores keep their
    document order, so the earlier page wins a tie.

    Args:
        pdf_path: PDF file to open.
        count: Maximum number of pages to return; must be at least 1.
        skip_first: Number of leading pages excluded from scoring; must not
            be negative.

    Returns:
        The selected ``PageScore`` records ordered by page number.

    Raises:
        ValueError: If ``count`` is below 1 or ``skip_first`` is negative.
    """
    if count < 1:
        raise ValueError("count must be at least 1")
    if skip_first < 0:
        raise ValueError("skip_first cannot be negative")

    with fitz.open(pdf_path) as document:
        # Page numbers are 1-based, so "number > skip_first" drops exactly
        # the first ``skip_first`` pages.
        candidates = [
            score_page(page, number)
            for number, page in enumerate(document, start=1)
            if number > skip_first
        ]

    ranked = sorted(candidates, key=lambda entry: entry.score, reverse=True)
    best = ranked[:count]
    best.sort(key=lambda entry: entry.page)
    return best
|
|
|
|
def write_selected_pdf(source_pdf: Path, destination: Path, selected: list[PageScore]) -> None:
    """Copy the selected pages of ``source_pdf`` into a new PDF at ``destination``.

    Pages are copied in the order given by ``selected``.  The parent
    directory of ``destination`` is created if it does not exist.

    Args:
        source_pdf: PDF to copy pages from.
        destination: Path of the PDF to write.
        selected: Records whose 1-based ``page`` numbers identify the pages
            to copy.
    """
    destination.parent.mkdir(parents=True, exist_ok=True)
    # Both documents are managed by the ``with`` statement so the in-memory
    # output document is closed even when copying or saving raises.
    with fitz.open(source_pdf) as source, fitz.open() as output:
        for item in selected:
            # ``insert_pdf`` takes 0-based page indexes.
            page_index = item.page - 1
            output.insert_pdf(source, from_page=page_index, to_page=page_index)
        output.save(destination)
|
|
|
|
def build_test_pdf(source_pdf: Path, destination: Path, count: int = 5, skip_first: int = 0) -> dict[str, object]:
    """Select the best pages of ``source_pdf`` and write them to ``destination``.

    Args:
        source_pdf: Existing file with a ``.pdf`` suffix (case-insensitive).
        destination: Path of the sample PDF to create.
        count: Maximum number of pages to include.
        skip_first: Number of leading pages to leave out of the selection.

    Returns:
        A dictionary with the keys ``"source"`` and ``"output"`` (paths as
        strings), ``"pages"`` (selected 1-based page numbers) and
        ``"scores"`` (one dictionary per selected ``PageScore``).

    Raises:
        FileNotFoundError: If ``source_pdf`` does not exist.
        ValueError: If ``source_pdf`` does not have a ``.pdf`` suffix, or if
            no page is left to select (for example because ``skip_first``
            covers the whole document).
    """
    if not source_pdf.exists():
        raise FileNotFoundError(f"PDF not found: {source_pdf}")
    if source_pdf.suffix.lower() != ".pdf":
        raise ValueError("Input must be a PDF file.")

    selected = select_pages(source_pdf, count=count, skip_first=skip_first)
    if not selected:
        # Fail here, before any directory or file is created, instead of
        # handing an empty page list to the writer.
        raise ValueError(
            f"No pages left to select from {source_pdf} "
            f"after skipping the first {skip_first} page(s)."
        )

    write_selected_pdf(source_pdf, destination, selected)
    return {
        "source": str(source_pdf),
        "output": str(destination),
        "pages": [item.page for item in selected],
        "scores": [asdict(item) for item in selected],
    }
|
|
|
|
def main_cli(argv: list[str] | None = None) -> None:
    """Command-line entry point: build a sample PDF from the best pages.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``.  ``None``
            (the default) keeps the normal command-line behaviour.

    Raises:
        SystemExit: With status 2 when the arguments are invalid, or when
            ``build_test_pdf`` rejects the input with ``FileNotFoundError``
            or ``ValueError``; the message is printed with the usage text.
    """
    # Where the stream supports it, emit UTF-8 and replace anything that
    # cannot be encoded instead of raising while printing.
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8", errors="replace")

    parser = argparse.ArgumentParser(description="Create a small Arabic PDF sample from the most informative pages.")
    parser.add_argument("pdf", type=Path, help="Source Arabic PDF")
    parser.add_argument("--out", type=Path, help="Output sample PDF path")
    parser.add_argument("--count", type=int, default=5, help="Number of pages to select")
    parser.add_argument("--skip-first", type=int, default=0, help="Ignore the first N pages before scoring")
    parser.add_argument("--json", action="store_true", help="Print JSON details")
    args = parser.parse_args(argv)

    # Default output sits next to the source file and encodes the page count.
    output = args.out or args.pdf.with_name(f"{args.pdf.stem}-best-{args.count}-pages.pdf")
    try:
        result = build_test_pdf(args.pdf, output, count=args.count, skip_first=args.skip_first)
    except (FileNotFoundError, ValueError) as exc:
        # Expected user errors become a usage message and exit status 2
        # rather than a traceback.
        parser.error(str(exc))

    if args.json:
        print(json.dumps(result, ensure_ascii=False, indent=2))
    else:
        print(f"Wrote {result['output']}")
        print(f"Selected pages: {', '.join(str(page) for page in result['pages'])}")


if __name__ == "__main__":
    main_cli()
|
|