"""Create a small sample PDF from the highest-scoring pages of a source PDF.

Every page is scored from three signals: the length of its cleaned text, the
number of ``main.ARABIC_RE`` matches in that text, and the share of dark pixels
in a low-resolution grayscale render. The best pages are then copied, in
document order, into a new PDF.
"""

from __future__ import annotations

import argparse
import json
import sys
from dataclasses import asdict, dataclass
from pathlib import Path

import fitz

# Put the directory above this file's folder on sys.path so that ``app`` can
# be imported when the file is executed directly as a script.
ROOT_DIR = Path(__file__).resolve().parent.parent
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))

from app import main  # noqa: E402  (must come after the sys.path tweak)

# Quantile of the sorted gray levels that is taken as the page background.
_BACKGROUND_QUANTILE = 0.95
# A pixel counts as "ink" when it is more than this many gray levels darker
# than the background level.
_INK_MARGIN = 25
# Weights used to combine the per-page signals into a single score.
_ARABIC_WORD_WEIGHT = 12
_INK_RATIO_WEIGHT = 800


@dataclass
class PageScore:
    """Scoring details for a single page."""

    page: int  # 1-based page number
    score: float  # combined score, rounded to 2 decimals
    characters: int  # length of the cleaned page text
    arabic_words: int  # number of main.ARABIC_RE matches in the text
    ink_ratio: float  # share of dark pixels, rounded to 4 decimals


def page_ink_ratio(page: fitz.Page, zoom: float = 0.25) -> float:
    """Return the fraction of rendered pixels that are darker than the background.

    The page is rendered in grayscale without alpha at ``zoom`` scale. The
    background level is the gray value found at the 95% position of the sorted
    samples; samples more than 25 levels below it are counted as dark.

    Args:
        page: Page to render.
        zoom: Scale factor applied to both axes of the render.

    Returns:
        A value in ``[0.0, 1.0]``; ``0.0`` when the render produced no samples.
    """
    pixmap = page.get_pixmap(
        matrix=fitz.Matrix(zoom, zoom),
        alpha=False,
        colorspace=fitz.csGRAY,
    )
    data = bytes(pixmap.samples)
    if not data:
        return 0.0

    ordered = sorted(data)
    background = ordered[int((len(ordered) - 1) * _BACKGROUND_QUANTILE)]
    # Clamp at zero; with a threshold of 0 no sample can be "darker".
    threshold = max(0, background - _INK_MARGIN)
    dark_pixels = sum(1 for value in data if value < threshold)
    return dark_pixels / len(data)


def score_page(page: fitz.Page, page_number: int) -> PageScore:
    """Compute the score of one page.

    The score is ``len(text) + 12 * arabic_matches + 800 * ink_ratio``, where
    ``text`` is the sorted plain-text extraction of the page passed through
    ``main.clean_arabic_text`` and ``arabic_matches`` is the number of
    ``main.ARABIC_RE`` matches in it.

    NOTE(review): ``clean_arabic_text`` and ``ARABIC_RE`` live in ``app.main``;
    their exact semantics are not visible from this file.

    Args:
        page: Page to score.
        page_number: Page number stored in the result (callers pass 1-based).

    Returns:
        A ``PageScore`` with the rounded score and the raw counts.
    """
    text = main.clean_arabic_text(page.get_text("text", sort=True))
    arabic_words = main.ARABIC_RE.findall(text)
    ink_ratio = page_ink_ratio(page)
    score = (
        len(text)
        + (len(arabic_words) * _ARABIC_WORD_WEIGHT)
        + (ink_ratio * _INK_RATIO_WEIGHT)
    )
    return PageScore(
        page=page_number,
        score=round(score, 2),
        characters=len(text),
        arabic_words=len(arabic_words),
        ink_ratio=round(ink_ratio, 4),
    )


def select_pages(pdf_path: Path, count: int = 5, skip_first: int = 0) -> list[PageScore]:
    """Score the pages of a PDF and return the best ones in page order.

    Args:
        pdf_path: PDF to analyse.
        count: Maximum number of pages to return.
        skip_first: Number of leading pages excluded from scoring.

    Returns:
        Up to ``count`` ``PageScore`` items sorted by page number. The list is
        shorter when fewer pages remain after skipping, and empty when none do.

    Raises:
        ValueError: If ``count`` is below 1 or ``skip_first`` is negative.
    """
    if count < 1:
        raise ValueError("count must be at least 1")
    if skip_first < 0:
        raise ValueError("skip_first cannot be negative")

    with fitz.open(pdf_path) as document:
        scores = [
            score_page(page, index + 1)
            for index, page in enumerate(document)
            if index >= skip_first
        ]

    # Keep the highest scores (the sort is stable, so ties keep the earlier
    # page), then restore document order for the output.
    best = sorted(scores, key=lambda item: item.score, reverse=True)[:count]
    return sorted(best, key=lambda item: item.page)


def write_selected_pdf(source_pdf: Path, destination: Path, selected: list[PageScore]) -> None:
    """Copy the selected pages from ``source_pdf`` into a new PDF.

    Args:
        source_pdf: PDF the pages are taken from.
        destination: Path of the PDF to write; missing parent directories are
            created.
        selected: Pages to copy, in the order they should appear. ``page`` is
            1-based.

    Raises:
        ValueError: If ``selected`` is empty, since a PDF without pages cannot
            be written.
    """
    if not selected:
        # Fail early with a clear message instead of letting the save of an
        # empty document fail after directories were already created.
        raise ValueError("No pages were selected; cannot write an empty PDF.")

    destination.parent.mkdir(parents=True, exist_ok=True)
    # Both documents are context-managed so the output handle is released even
    # when copying or saving fails.
    with fitz.open(source_pdf) as source, fitz.open() as output:
        for item in selected:
            page_index = item.page - 1  # insert_pdf takes 0-based page indexes
            output.insert_pdf(source, from_page=page_index, to_page=page_index)
        output.save(destination)


def build_test_pdf(
    source_pdf: Path,
    destination: Path,
    count: int = 5,
    skip_first: int = 0,
) -> dict[str, object]:
    """Select the best pages of ``source_pdf`` and write them to ``destination``.

    Args:
        source_pdf: Existing file with a ``.pdf`` suffix (case-insensitive).
        destination: Path of the sample PDF to create.
        count: Maximum number of pages to select.
        skip_first: Number of leading pages excluded from scoring.

    Returns:
        A dict with the keys ``source`` and ``output`` (paths as strings),
        ``pages`` (selected 1-based page numbers) and ``scores`` (one dict per
        selected page, built from ``PageScore``).

    Raises:
        FileNotFoundError: If ``source_pdf`` does not exist.
        ValueError: If the suffix is not ``.pdf``, if ``count`` or
            ``skip_first`` is out of range, or if no page could be selected.
    """
    if not source_pdf.exists():
        raise FileNotFoundError(f"PDF not found: {source_pdf}")
    if source_pdf.suffix.lower() != ".pdf":
        raise ValueError("Input must be a PDF file.")

    selected = select_pages(source_pdf, count=count, skip_first=skip_first)
    write_selected_pdf(source_pdf, destination, selected)
    return {
        "source": str(source_pdf),
        "output": str(destination),
        "pages": [item.page for item in selected],
        "scores": [asdict(item) for item in selected],
    }


def main_cli() -> None:
    """Parse the command line, build the sample PDF and report the result."""
    # Emit UTF-8 regardless of the console encoding, replacing characters that
    # cannot be encoded instead of raising.
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8", errors="replace")

    parser = argparse.ArgumentParser(
        description="Create a small Arabic PDF sample from the most informative pages."
    )
    parser.add_argument("pdf", type=Path, help="Source Arabic PDF")
    parser.add_argument("--out", type=Path, help="Output sample PDF path")
    parser.add_argument("--count", type=int, default=5, help="Number of pages to select")
    parser.add_argument(
        "--skip-first",
        type=int,
        default=0,
        help="Ignore the first N pages before scoring",
    )
    parser.add_argument("--json", action="store_true", help="Print JSON details")
    args = parser.parse_args()

    # Without --out the sample is written next to the source file.
    output = args.out or args.pdf.with_name(f"{args.pdf.stem}-best-{args.count}-pages.pdf")
    result = build_test_pdf(args.pdf, output, count=args.count, skip_first=args.skip_first)

    if args.json:
        print(json.dumps(result, ensure_ascii=False, indent=2))
    else:
        print(f"Wrote {result['output']}")
        print(f"Selected pages: {', '.join(str(page) for page in result['pages'])}")


if __name__ == "__main__":
    main_cli()