zeroshotGPU / zsgdp /profiling /document_profile.py
Arjunvir Singh
Initial commit: zeroshotGPU MVP with full eval surface
db06ffa
"""Top-level document profiling entrypoint."""
from __future__ import annotations
from pathlib import Path
from zsgdp.profiling.heuristics import (
aggregate_document_labels,
estimate_formula_density,
estimate_table_density,
estimate_text_quality,
labels_for_page,
)
from zsgdp.profiling.pdf_profile import profile_pdf
from zsgdp.schema import DocumentProfile, PageProfile
from zsgdp.utils import document_id_for_path, file_type_from_path
def profile_document(path: str | Path) -> DocumentProfile:
    """Profile the document at ``path`` using the profiler for its file type.

    The document id and file type are derived from the path via
    ``document_id_for_path`` and ``file_type_from_path``. Dispatch is:

    * ``"pdf"`` -> ``profile_pdf``
    * ``"text"``, ``"markdown"``, ``"html"`` -> ``_profile_text_like``
    * anything else -> ``_profile_unknown``

    Args:
        path: Location of the document, as a string or ``Path``.

    Returns:
        The ``DocumentProfile`` produced by the selected profiler.

    Raises:
        FileNotFoundError: If nothing exists at ``path``. The exception
            carries the ``Path`` object as its sole argument.
    """
    source = Path(path)
    if not source.exists():
        raise FileNotFoundError(source)

    identifier = document_id_for_path(source)
    kind = file_type_from_path(source)
    text_like_kinds = {"text", "markdown", "html"}

    if kind == "pdf":
        profile = profile_pdf(source, identifier)
    elif kind in text_like_kinds:
        profile = _profile_text_like(source, identifier, kind)
    else:
        # No dedicated profiler: fall back to the placeholder profile.
        profile = _profile_unknown(source, identifier, kind)
    return profile
def _profile_text_like(path: Path, doc_id: str, file_type: str) -> DocumentProfile:
    """Build a one-page profile for a file whose content is read as plain text.

    The file is decoded as UTF-8 with undecodable bytes replaced, and the
    whole content is described by a single ``PageProfile`` numbered 1.

    Args:
        path: File to read.
        doc_id: Identifier stored on the resulting profile.
        file_type: File type stored on the resulting profile.

    Returns:
        A ``DocumentProfile`` with ``page_count`` of 1, whose labels are
        aggregated from the single page and whose metadata records the
        ``"text_like"`` profiler.
    """
    content = path.read_text(encoding="utf-8", errors="replace")

    table_density, table_candidates = estimate_table_density(content)
    formula_density = estimate_formula_density(content)

    # Only lines containing non-whitespace characters count as text blocks.
    block_count = sum(1 for row in content.splitlines() if row.strip())
    # Character count ignores leading/trailing whitespace of the whole file.
    stripped_chars = len(content.strip())

    if block_count:
        mean_block_chars = stripped_chars / block_count
    else:
        # Empty or whitespace-only file: avoid dividing by zero.
        mean_block_chars = 0.0

    text_quality = estimate_text_quality(stripped_chars, block_count)

    sole_page = PageProfile(
        page_num=1,
        digital_text_chars=stripped_chars,
        text_block_count=block_count,
        avg_chars_per_text_block=mean_block_chars,
        table_density=table_density,
        table_candidate_count=table_candidates,
        formula_density=formula_density,
        digital_text_quality=text_quality,
        scanned_score=0.0,
        metadata={"line_count": block_count},
    )
    # Labels are derived from the populated page, so assign them afterwards.
    sole_page.labels = labels_for_page(sole_page)

    page_list = [sole_page]
    document_labels = aggregate_document_labels(page_list)
    return DocumentProfile(
        doc_id=doc_id,
        source_path=str(path),
        file_type=file_type,
        page_count=1,
        extension=path.suffix.lower(),
        pages=page_list,
        labels=document_labels,
        metadata={"profiler": "text_like"},
    )
def _profile_unknown(path: Path, doc_id: str, file_type: str) -> DocumentProfile:
    """Return a placeholder profile for a file type with no native profiler.

    The file is not read. The result has a single page labelled
    ``"low_confidence"`` with a warning in its metadata, and the document
    itself is labelled ``"low_confidence"`` as well.

    Args:
        path: File being profiled; only its string form and suffix are used.
        doc_id: Identifier stored on the resulting profile.
        file_type: File type stored on the resulting profile. The value
            ``"image"`` gives the page a ``scanned_score`` of 0.5; any other
            value gives 0.0.

    Returns:
        A ``DocumentProfile`` with ``page_count`` of 1 whose metadata records
        the ``"unknown"`` profiler.
    """
    if file_type == "image":
        scanned = 0.5
    else:
        scanned = 0.0

    placeholder_page = PageProfile(
        page_num=1,
        scanned_score=scanned,
        digital_text_quality=0.0,
        labels=["low_confidence"],
        metadata={"warning": "No native profiler implemented for this file type yet."},
    )

    suffix = path.suffix.lower()
    return DocumentProfile(
        doc_id=doc_id,
        source_path=str(path),
        file_type=file_type,
        page_count=1,
        extension=suffix,
        pages=[placeholder_page],
        labels=["low_confidence"],
        metadata={"profiler": "unknown"},
    )