# File size: 4,135 Bytes | 2e1a095
from __future__ import annotations
import argparse
import json
import sys
from dataclasses import asdict, dataclass
from pathlib import Path
import fitz
# Parent of the directory that contains this file; presumably the project
# root that holds the ``app`` package -- TODO confirm against the repo layout.
ROOT_DIR = Path(__file__).resolve().parent.parent
# Prepend it to the import search path (only once) so the ``from app import
# main`` statement that follows can be resolved when this file runs as a script.
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))
from app import main
@dataclass
class PageScore:
    """Scoring record for one page of the source PDF.

    Instances are produced by ``score_page`` and converted to plain dicts
    with ``dataclasses.asdict`` in ``build_test_pdf``.
    """

    # 1-based page number inside the source document.
    page: int
    # Combined ranking value; ``select_pages`` keeps the highest ones.
    score: float
    # Length of the cleaned text extracted from the page.
    characters: int
    # Number of ``main.ARABIC_RE`` matches found in the cleaned text.
    arabic_words: int
    # Fraction of rendered pixels that were classified as dark ("ink").
    ink_ratio: float
def page_ink_ratio(page: fitz.Page, zoom: float = 0.25) -> float:
    """Estimate how much of a page is covered by dark marks.

    The page is rendered to a grayscale pixmap (no alpha channel) scaled by
    ``zoom`` in both directions. The sample value at the 95th percentile is
    taken as the background level, and every sample more than 25 levels
    darker than that background counts as ink.

    Args:
        page: Page to render.
        zoom: Scale factor for the rendering matrix; the default keeps the
            pixmap small.

    Returns:
        Dark samples divided by total samples, or ``0.0`` when the pixmap
        has no sample data.
    """
    scale = fitz.Matrix(zoom, zoom)
    rendered = page.get_pixmap(matrix=scale, alpha=False, colorspace=fitz.csGRAY)
    samples = bytes(rendered.samples)
    total = len(samples)
    if total == 0:
        return 0.0

    # Background estimate: the value sitting 95% of the way through the
    # ascending-sorted samples.
    ordered = sorted(samples)
    background = ordered[int((total - 1) * 0.95)]

    # Anything below this cut-off is treated as ink; clamp so it never goes
    # negative on very dark pages.
    cutoff = max(0, background - 25)
    dark = 0
    for level in samples:
        if level < cutoff:
            dark += 1
    return dark / total
def score_page(page: fitz.Page, page_number: int) -> PageScore:
    """Compute the ranking score for a single page.

    The score adds the length of the cleaned page text, 12 points per
    ``main.ARABIC_RE`` match, and 800 times the page's ink ratio.

    Args:
        page: Page to analyse.
        page_number: Number stored in the result's ``page`` field
            (``select_pages`` passes a 1-based value).

    Returns:
        A ``PageScore`` with the score rounded to 2 decimals and the ink
        ratio rounded to 4 decimals.
    """
    raw_text = page.get_text("text", sort=True)
    # NOTE(review): ``clean_arabic_text`` lives in ``app.main``; its exact
    # normalisation rules are not visible from this file.
    cleaned = main.clean_arabic_text(raw_text)
    char_count = len(cleaned)
    word_count = len(main.ARABIC_RE.findall(cleaned))
    ink = page_ink_ratio(page)

    total = char_count + (word_count * 12) + (ink * 800)
    return PageScore(
        page=page_number,
        score=round(total, 2),
        characters=char_count,
        arabic_words=word_count,
        ink_ratio=round(ink, 4),
    )
def select_pages(pdf_path: Path, count: int = 5, skip_first: int = 0) -> list[PageScore]:
    """Pick the highest-scoring pages of a PDF.

    Args:
        pdf_path: PDF to open and score.
        count: Maximum number of pages to return.
        skip_first: Number of leading pages excluded from scoring.

    Returns:
        Up to ``count`` ``PageScore`` entries with the highest scores,
        ordered by ascending page number.

    Raises:
        ValueError: If ``count`` is below 1 or ``skip_first`` is negative.
    """
    if count < 1:
        raise ValueError("count must be at least 1")
    if skip_first < 0:
        raise ValueError("skip_first cannot be negative")

    scored = []
    with fitz.open(pdf_path) as document:
        for position, page in enumerate(document):
            if position < skip_first:
                continue
            # Page numbers reported to callers are 1-based.
            scored.append(score_page(page, position + 1))

    # Rank by score (best first), keep the top ``count``, then restore
    # document order for the caller.
    ranked = sorted(scored, key=lambda entry: entry.score, reverse=True)
    best = ranked[:count]
    best.sort(key=lambda entry: entry.page)
    return best
def write_selected_pdf(source_pdf: Path, destination: Path, selected: list[PageScore]) -> None:
    """Copy the selected pages of ``source_pdf`` into a new PDF.

    Args:
        source_pdf: PDF the pages are copied from.
        destination: Path of the PDF to write; missing parent directories
            are created.
        selected: Entries whose 1-based ``page`` numbers identify the pages
            to copy, in the order they should appear in the output.

    Raises:
        ValueError: If ``selected`` is empty, since there is nothing to write.
    """
    # Fail before touching the filesystem: a document without pages cannot be
    # saved meaningfully, and we should not leave empty directories behind.
    if not selected:
        raise ValueError("No pages selected; nothing to write.")

    destination.parent.mkdir(parents=True, exist_ok=True)
    # Both documents are context-managed so the output document is closed too,
    # even when copying or saving raises.
    with fitz.open(source_pdf) as source, fitz.open() as output:
        for item in selected:
            page_index = item.page - 1  # insert_pdf expects 0-based indices
            output.insert_pdf(source, from_page=page_index, to_page=page_index)
        output.save(destination)
def build_test_pdf(source_pdf: Path, destination: Path, count: int = 5, skip_first: int = 0) -> dict[str, object]:
    """Create a sample PDF from the highest-scoring pages of ``source_pdf``.

    Args:
        source_pdf: Existing PDF file to sample from.
        destination: Path of the sample PDF to write.
        count: Maximum number of pages to include.
        skip_first: Number of leading pages to ignore when scoring.

    Returns:
        A dict with ``"source"`` and ``"output"`` (paths as strings),
        ``"pages"`` (selected 1-based page numbers) and ``"scores"`` (one
        dict per selected page, produced with ``dataclasses.asdict``).

    Raises:
        FileNotFoundError: If ``source_pdf`` is not an existing file.
        ValueError: If ``source_pdf`` lacks a ``.pdf`` suffix or
            ``destination`` refers to the source file itself.
    """
    # is_file() also rejects a directory that happens to be named "*.pdf".
    if not source_pdf.is_file():
        raise FileNotFoundError(f"PDF not found: {source_pdf}")
    if source_pdf.suffix.lower() != ".pdf":
        raise ValueError("Input must be a PDF file.")
    # Writing the sample over the source would clobber the input while it is
    # still open for reading. samefile() compares the underlying files, so
    # symlinks and alternative spellings of the same path are caught as well.
    if destination.exists() and destination.samefile(source_pdf):
        raise ValueError("Output path must differ from the source PDF.")

    selected = select_pages(source_pdf, count=count, skip_first=skip_first)
    write_selected_pdf(source_pdf, destination, selected)
    return {
        "source": str(source_pdf),
        "output": str(destination),
        "pages": [item.page for item in selected],
        "scores": [asdict(item) for item in selected],
    }
def main_cli(argv: list[str] | None = None) -> None:
    """Command-line entry point: build a sample PDF and report the result.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. The default
            ``None`` keeps the normal command-line behaviour; passing a list
            allows the CLI to be driven programmatically.

    The default output path is ``<stem>-best-<count>-pages.pdf`` next to the
    input file. With ``--json`` the full result dict is printed as JSON;
    otherwise the output path and the selected page numbers are printed.
    """
    # Emit UTF-8 (replacing unencodable characters) so non-ASCII text in the
    # paths or the JSON output does not crash on consoles with a narrower
    # encoding. Not every stdout replacement offers reconfigure().
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8", errors="replace")

    parser = argparse.ArgumentParser(description="Create a small Arabic PDF sample from the most informative pages.")
    parser.add_argument("pdf", type=Path, help="Source Arabic PDF")
    parser.add_argument("--out", type=Path, help="Output sample PDF path")
    parser.add_argument("--count", type=int, default=5, help="Number of pages to select")
    parser.add_argument("--skip-first", type=int, default=0, help="Ignore the first N pages before scoring")
    parser.add_argument("--json", action="store_true", help="Print JSON details")
    args = parser.parse_args(argv)

    output = args.out or args.pdf.with_name(f"{args.pdf.stem}-best-{args.count}-pages.pdf")
    result = build_test_pdf(args.pdf, output, count=args.count, skip_first=args.skip_first)

    if args.json:
        # ensure_ascii=False keeps non-ASCII characters readable in the output.
        print(json.dumps(result, ensure_ascii=False, indent=2))
    else:
        print(f"Wrote {result['output']}")
        print(f"Selected pages: {', '.join(str(page) for page in result['pages'])}")
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main_cli()
|