"""Top-level document profiling entrypoint.""" from __future__ import annotations from pathlib import Path from zsgdp.profiling.heuristics import ( aggregate_document_labels, estimate_formula_density, estimate_table_density, estimate_text_quality, labels_for_page, ) from zsgdp.profiling.pdf_profile import profile_pdf from zsgdp.schema import DocumentProfile, PageProfile from zsgdp.utils import document_id_for_path, file_type_from_path def profile_document(path: str | Path) -> DocumentProfile: path_obj = Path(path) if not path_obj.exists(): raise FileNotFoundError(path_obj) doc_id = document_id_for_path(path_obj) file_type = file_type_from_path(path_obj) if file_type == "pdf": return profile_pdf(path_obj, doc_id) if file_type in {"text", "markdown", "html"}: return _profile_text_like(path_obj, doc_id, file_type) return _profile_unknown(path_obj, doc_id, file_type) def _profile_text_like(path: Path, doc_id: str, file_type: str) -> DocumentProfile: text = path.read_text(encoding="utf-8", errors="replace") table_density, table_count = estimate_table_density(text) formula_density = estimate_formula_density(text) lines = [line for line in text.splitlines() if line.strip()] char_count = len(text.strip()) page = PageProfile( page_num=1, digital_text_chars=char_count, text_block_count=len(lines), avg_chars_per_text_block=(char_count / len(lines)) if lines else 0.0, table_density=table_density, table_candidate_count=table_count, formula_density=formula_density, digital_text_quality=estimate_text_quality(char_count, len(lines)), scanned_score=0.0, metadata={"line_count": len(lines)}, ) page.labels = labels_for_page(page) pages = [page] return DocumentProfile( doc_id=doc_id, source_path=str(path), file_type=file_type, page_count=1, extension=path.suffix.lower(), pages=pages, labels=aggregate_document_labels(pages), metadata={"profiler": "text_like"}, ) def _profile_unknown(path: Path, doc_id: str, file_type: str) -> DocumentProfile: page = PageProfile( page_num=1, scanned_score=0.5 if file_type == "image" else 0.0, digital_text_quality=0.0, labels=["low_confidence"], metadata={"warning": "No native profiler implemented for this file type yet."}, ) return DocumentProfile( doc_id=doc_id, source_path=str(path), file_type=file_type, page_count=1, extension=path.suffix.lower(), pages=[page], labels=["low_confidence"], metadata={"profiler": "unknown"}, )