Spaces:
Running on Zero
Running on Zero
| """Top-level document profiling entrypoint.""" | |
| from __future__ import annotations | |
| from pathlib import Path | |
| from zsgdp.profiling.heuristics import ( | |
| aggregate_document_labels, | |
| estimate_formula_density, | |
| estimate_table_density, | |
| estimate_text_quality, | |
| labels_for_page, | |
| ) | |
| from zsgdp.profiling.pdf_profile import profile_pdf | |
| from zsgdp.schema import DocumentProfile, PageProfile | |
| from zsgdp.utils import document_id_for_path, file_type_from_path | |
def profile_document(path: str | Path) -> DocumentProfile:
    """Build a ``DocumentProfile`` for the file at *path*.

    The path is resolved to a document id and a file type, and the work is
    handed to the profiler matching that type: ``profile_pdf`` for ``"pdf"``,
    ``_profile_text_like`` for ``"text"``, ``"markdown"`` and ``"html"``, and
    ``_profile_unknown`` for everything else.

    Args:
        path: Location of the document, as a string or ``Path``.

    Returns:
        The profile produced by the selected profiler.

    Raises:
        FileNotFoundError: If *path* does not exist. The exception carries
            the ``Path`` object as its only argument.
    """
    source = Path(path)
    if not source.exists():
        raise FileNotFoundError(source)

    document_id = document_id_for_path(source)
    kind = file_type_from_path(source)

    # PDFs have a dedicated profiler with a two-argument signature.
    if kind == "pdf":
        return profile_pdf(source, document_id)

    # The remaining profilers share one signature, so pick and call once.
    is_text_like = kind in {"text", "markdown", "html"}
    profiler = _profile_text_like if is_text_like else _profile_unknown
    return profiler(source, document_id, kind)
def _profile_text_like(path: Path, doc_id: str, file_type: str) -> DocumentProfile:
    """Profile a text-like file as a one-page document.

    The file is read as UTF-8 with undecodable bytes replaced instead of
    raising. The whole content is described by a single ``PageProfile``
    (page 1) whose table, formula and text-quality figures come from the
    heuristics helpers.

    Args:
        path: File to read.
        doc_id: Identifier stored on the resulting profile.
        file_type: File type stored on the resulting profile unchanged.

    Returns:
        A ``DocumentProfile`` with ``page_count=1`` and
        ``metadata={"profiler": "text_like"}``.
    """
    content = path.read_text(encoding="utf-8", errors="replace")
    table_density, table_candidates = estimate_table_density(content)
    formula_density = estimate_formula_density(content)

    # Only lines holding non-whitespace characters count as text blocks.
    block_count = sum(1 for raw_line in content.splitlines() if raw_line.strip())
    # Character count ignores leading/trailing whitespace of the whole file.
    stripped_chars = len(content.strip())

    if block_count:
        avg_block_chars = stripped_chars / block_count
    else:
        # No non-blank lines: avoid dividing by zero.
        avg_block_chars = 0.0

    text_quality = estimate_text_quality(stripped_chars, block_count)

    only_page = PageProfile(
        page_num=1,
        digital_text_chars=stripped_chars,
        text_block_count=block_count,
        avg_chars_per_text_block=avg_block_chars,
        table_density=table_density,
        table_candidate_count=table_candidates,
        formula_density=formula_density,
        digital_text_quality=text_quality,
        scanned_score=0.0,
        metadata={"line_count": block_count},
    )
    # Labels are derived from the finished page, so assign them afterwards.
    only_page.labels = labels_for_page(only_page)

    pages = [only_page]
    document_labels = aggregate_document_labels(pages)
    return DocumentProfile(
        doc_id=doc_id,
        source_path=str(path),
        file_type=file_type,
        page_count=1,
        extension=path.suffix.lower(),
        pages=pages,
        labels=document_labels,
        metadata={"profiler": "text_like"},
    )
def _profile_unknown(path: Path, doc_id: str, file_type: str) -> DocumentProfile:
    """Return a placeholder profile for a file type with no native profiler.

    The file itself is never opened; only its path and suffix are used. The
    result is a one-page document whose page and document labels are both
    ``["low_confidence"]``.

    Args:
        path: Location of the document; only its string form and suffix are
            recorded.
        doc_id: Identifier stored on the resulting profile.
        file_type: File type stored on the resulting profile unchanged. The
            value ``"image"`` gets a ``scanned_score`` of 0.5, anything else
            0.0.

    Returns:
        A ``DocumentProfile`` with ``page_count=1`` and
        ``metadata={"profiler": "unknown"}``.
    """
    if file_type == "image":
        # NOTE(review): 0.5 looks like a "might be scanned" midpoint - confirm.
        scanned = 0.5
    else:
        scanned = 0.0

    placeholder_page = PageProfile(
        page_num=1,
        scanned_score=scanned,
        digital_text_quality=0.0,
        labels=["low_confidence"],
        metadata={"warning": "No native profiler implemented for this file type yet."},
    )

    suffix = path.suffix.lower()
    return DocumentProfile(
        doc_id=doc_id,
        source_path=str(path),
        file_type=file_type,
        page_count=1,
        extension=suffix,
        pages=[placeholder_page],
        # A separate list from the page's labels, so the two never alias.
        labels=["low_confidence"],
        metadata={"profiler": "unknown"},
    )