File size: 4,135 Bytes
2e1a095
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
from __future__ import annotations

import argparse
import json
import sys
from dataclasses import asdict, dataclass
from pathlib import Path

import fitz

# ROOT_DIR is the parent of the directory containing this file. It is put at
# the front of sys.path (once) so the `app` import below can resolve when this
# file is run directly -- presumably ROOT_DIR is the project root that holds
# the `app` package; verify against the repository layout.
ROOT_DIR = Path(__file__).resolve().parent.parent
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))

# Deliberately imported after the sys.path adjustment above. This module uses
# main.clean_arabic_text() and main.ARABIC_RE from it.
from app import main


@dataclass
class PageScore:
    """Scoring result for one PDF page, as built by ``score_page``."""

    # 1-based page number within the source document.
    page: int
    # Combined score (text length, Arabic word count and ink ratio), rounded to 2 decimals.
    score: float
    # Length of the page text after cleaning.
    characters: int
    # Number of ARABIC_RE matches found in the cleaned text.
    arabic_words: int
    # Fraction of rendered samples counted as dark, rounded to 4 decimals.
    ink_ratio: float


def page_ink_ratio(page: fitz.Page, zoom: float = 0.25) -> float:
    """Estimate how much of a rendered page is darker than its background.

    The page is rendered to a grayscale pixmap without alpha, scaled by
    ``zoom`` on both axes. The sample value found at the 95% position of the
    ascending samples is taken as the background level, and every sample that
    lies more than 25 levels below it is counted as dark.

    Args:
        page: Page to render.
        zoom: Scale factor applied to both axes when rendering.

    Returns:
        Dark sample count divided by the total sample count, or ``0.0`` when
        the rendering produced no sample data.
    """
    scale = fitz.Matrix(zoom, zoom)
    rendered = page.get_pixmap(matrix=scale, alpha=False, colorspace=fitz.csGRAY)
    samples = bytes(rendered.samples)
    total = len(samples)
    if total == 0:
        return 0.0

    # A high percentile (rather than the maximum) stands in for the background
    # level, so a few extreme samples do not decide it.
    ranked = sorted(samples)
    paper_level = ranked[int((total - 1) * 0.95)]
    cutoff = max(0, paper_level - 25)

    inked = 0
    for level in samples:
        if level < cutoff:
            inked += 1
    return inked / total


def score_page(page: fitz.Page, page_number: int) -> PageScore:
    """Compute the score used to rank a single page.

    The score is the cleaned text length, plus 12 for every ``ARABIC_RE``
    match in that text, plus 800 times the page's ink ratio.

    Args:
        page: Page to evaluate.
        page_number: Number stored as ``PageScore.page`` (callers in this
            file pass the 1-based page number).

    Returns:
        A ``PageScore`` with the score rounded to 2 decimals and the ink
        ratio rounded to 4 decimals.
    """
    raw_text = page.get_text("text", sort=True)
    cleaned = main.clean_arabic_text(raw_text)
    char_count = len(cleaned)
    word_count = len(main.ARABIC_RE.findall(cleaned))
    ink = page_ink_ratio(page)

    total = char_count + (word_count * 12) + (ink * 800)
    return PageScore(
        page=page_number,
        score=round(total, 2),
        characters=char_count,
        arabic_words=word_count,
        ink_ratio=round(ink, 4),
    )


def select_pages(pdf_path: Path, count: int = 5, skip_first: int = 0) -> list[PageScore]:
    """Pick the highest-scoring pages of a PDF.

    Args:
        pdf_path: PDF file to open and score.
        count: Maximum number of pages to return; must be at least 1.
        skip_first: Number of leading pages excluded from scoring; must not
            be negative.

    Returns:
        Up to ``count`` ``PageScore`` entries with the highest scores,
        ordered by ascending page number. The list is empty when no page
        remains after skipping.

    Raises:
        ValueError: If ``count`` is below 1 or ``skip_first`` is negative.
    """
    if count < 1:
        raise ValueError("count must be at least 1")
    if skip_first < 0:
        raise ValueError("skip_first cannot be negative")

    candidates = []
    with fitz.open(pdf_path) as document:
        for index, page in enumerate(document):
            if index < skip_first:
                continue
            # Page numbers are reported 1-based.
            candidates.append(score_page(page, index + 1))

    # Rank by score (stable, so equal scores keep document order), keep the
    # best `count`, then restore document order for the result.
    ranked = sorted(candidates, key=lambda entry: entry.score, reverse=True)
    best = ranked[:count]
    best.sort(key=lambda entry: entry.page)
    return best


def write_selected_pdf(source_pdf: Path, destination: Path, selected: list[PageScore]) -> None:
    """Copy the selected pages of ``source_pdf`` into a new PDF at ``destination``.

    Pages are inserted in the order given by ``selected``. Missing parent
    directories of ``destination`` are created.

    Args:
        source_pdf: PDF the pages are copied from.
        destination: Path of the PDF to write.
        selected: Entries whose ``page`` attribute is the 1-based number of a
            page to copy.

    Raises:
        ValueError: If ``selected`` is empty. A PDF without pages cannot be
            saved, so this is reported before anything is created on disk.
    """
    if not selected:
        raise ValueError("No pages selected; refusing to write an empty PDF.")

    destination.parent.mkdir(parents=True, exist_ok=True)
    # Both documents are context-managed so the output handle is released as
    # well, even when inserting or saving raises.
    with fitz.open(source_pdf) as source, fitz.open() as output:
        for item in selected:
            page_index = item.page - 1  # insert_pdf takes 0-based page indexes
            output.insert_pdf(source, from_page=page_index, to_page=page_index)
        output.save(destination)


def build_test_pdf(source_pdf: Path, destination: Path, count: int = 5, skip_first: int = 0) -> dict[str, object]:
    """Select the best-scoring pages of a PDF and write them to a new file.

    Args:
        source_pdf: Existing PDF to sample from.
        destination: Path of the sample PDF to write.
        count: Number of pages to select.
        skip_first: Number of leading pages to leave out of scoring.

    Returns:
        A dict with the keys ``"source"`` and ``"output"`` (paths as
        strings), ``"pages"`` (selected page numbers) and ``"scores"`` (one
        dict per selected ``PageScore``).

    Raises:
        FileNotFoundError: If ``source_pdf`` does not exist.
        ValueError: If ``source_pdf`` does not have a ``.pdf`` suffix
            (compared case-insensitively).
    """
    if not source_pdf.exists():
        raise FileNotFoundError(f"PDF not found: {source_pdf}")
    extension = source_pdf.suffix.lower()
    if extension != ".pdf":
        raise ValueError("Input must be a PDF file.")

    chosen = select_pages(source_pdf, count=count, skip_first=skip_first)
    write_selected_pdf(source_pdf, destination, chosen)

    summary: dict[str, object] = {
        "source": str(source_pdf),
        "output": str(destination),
    }
    summary["pages"] = [entry.page for entry in chosen]
    summary["scores"] = list(map(asdict, chosen))
    return summary


def main_cli() -> None:
    """Parse command-line arguments, build the sample PDF and report the result.

    When ``--out`` is not given, the output is written next to the source PDF
    as ``<stem>-best-<count>-pages.pdf``. With ``--json`` the full result dict
    is printed as JSON; otherwise the output path and the selected page
    numbers are printed.
    """
    # Switch stdout to UTF-8 where the stream supports it; characters that
    # still cannot be encoded are replaced instead of raising.
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8", errors="replace")

    cli = argparse.ArgumentParser(description="Create a small Arabic PDF sample from the most informative pages.")
    cli.add_argument("pdf", type=Path, help="Source Arabic PDF")
    cli.add_argument("--out", type=Path, help="Output sample PDF path")
    cli.add_argument("--count", type=int, default=5, help="Number of pages to select")
    cli.add_argument("--skip-first", type=int, default=0, help="Ignore the first N pages before scoring")
    cli.add_argument("--json", action="store_true", help="Print JSON details")
    options = cli.parse_args()

    if options.out:
        target = options.out
    else:
        target = options.pdf.with_name(f"{options.pdf.stem}-best-{options.count}-pages.pdf")

    report = build_test_pdf(options.pdf, target, count=options.count, skip_first=options.skip_first)

    if options.json:
        print(json.dumps(report, ensure_ascii=False, indent=2))
        return

    print(f"Wrote {report['output']}")
    page_list = ", ".join(map(str, report["pages"]))
    print(f"Selected pages: {page_list}")


# Run the command-line interface only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main_cli()