Spaces:

arjun10g
/

zeroshotGPU

Running on Zero

zeroshotGPU / zsgdp /profiling /document_profile.py

Arjunvir Singh

Initial commit: zeroshotGPU MVP with full eval surface

db06ffa about 1 month ago

2.77 kB

	"""Top-level document profiling entrypoint."""

	from __future__ import annotations

	from pathlib import Path

	from zsgdp.profiling.heuristics import (
	aggregate_document_labels,
	estimate_formula_density,
	estimate_table_density,
	estimate_text_quality,
	labels_for_page,
	)
	from zsgdp.profiling.pdf_profile import profile_pdf
	from zsgdp.schema import DocumentProfile, PageProfile
	from zsgdp.utils import document_id_for_path, file_type_from_path


	def profile_document(path: str | Path) -> DocumentProfile:
	    """Profile the document at ``path`` and return its ``DocumentProfile``.

	    The file type reported by ``file_type_from_path`` selects the profiler:
	    ``"pdf"`` is handled by ``profile_pdf``; ``"text"``, ``"markdown"`` and
	    ``"html"`` are handled by ``_profile_text_like``; every other type gets
	    the placeholder profile from ``_profile_unknown``.

	    Args:
	        path: Location of the document, as a string or ``Path``.

	    Returns:
	        The profile produced by the selected profiler.

	    Raises:
	        FileNotFoundError: If ``path`` does not exist.
	    """
	    path_obj = Path(path)
	    # Fail before computing an id or file type for a path that is not there.
	    if not path_obj.exists():
	        raise FileNotFoundError(path_obj)

	    doc_id = document_id_for_path(path_obj)
	    file_type = file_type_from_path(path_obj)
	    if file_type == "pdf":
	        return profile_pdf(path_obj, doc_id)
	    if file_type in {"text", "markdown", "html"}:
	        return _profile_text_like(path_obj, doc_id, file_type)
	    return _profile_unknown(path_obj, doc_id, file_type)


	def _profile_text_like(path: Path, doc_id: str, file_type: str) -> DocumentProfile:
	    """Build a one-page ``DocumentProfile`` from a file read as plain text.

	    The file is decoded as UTF-8 with undecodable bytes replaced, and the
	    whole content is treated as page 1. Non-blank lines are counted as the
	    page's text blocks, and the character count is taken from the content
	    with leading and trailing whitespace stripped.
	    """
	    content = path.read_text(encoding="utf-8", errors="replace")
	    table_density, table_candidates = estimate_table_density(content)
	    formula_density = estimate_formula_density(content)

	    non_blank_rows = [row for row in content.splitlines() if row.strip()]
	    block_count = len(non_blank_rows)
	    stripped_length = len(content.strip())

	    # Avoid dividing by zero when the file has no non-blank lines.
	    if non_blank_rows:
	        mean_block_chars = stripped_length / block_count
	    else:
	        mean_block_chars = 0.0

	    text_quality = estimate_text_quality(stripped_length, block_count)

	    only_page = PageProfile(
	        page_num=1,
	        digital_text_chars=stripped_length,
	        text_block_count=block_count,
	        avg_chars_per_text_block=mean_block_chars,
	        table_density=table_density,
	        table_candidate_count=table_candidates,
	        formula_density=formula_density,
	        digital_text_quality=text_quality,
	        scanned_score=0.0,
	        metadata={"line_count": block_count},
	    )
	    # Labels are derived from the finished page, so assign them afterwards.
	    only_page.labels = labels_for_page(only_page)

	    page_list = [only_page]
	    return DocumentProfile(
	        doc_id=doc_id,
	        source_path=str(path),
	        file_type=file_type,
	        page_count=1,
	        extension=path.suffix.lower(),
	        pages=page_list,
	        labels=aggregate_document_labels(page_list),
	        metadata={"profiler": "text_like"},
	    )


	def _profile_unknown(path: Path, doc_id: str, file_type: str) -> DocumentProfile:
	    """Return a placeholder one-page profile for a file with no native profiler.

	    The file is not read. Both the page and the document are labelled
	    ``"low_confidence"``, and the page metadata carries a warning. The
	    scanned score is 0.5 when ``file_type`` is ``"image"`` and 0.0 otherwise.
	    """
	    if file_type == "image":
	        scan_guess = 0.5
	    else:
	        scan_guess = 0.0

	    placeholder_page = PageProfile(
	        page_num=1,
	        scanned_score=scan_guess,
	        digital_text_quality=0.0,
	        labels=["low_confidence"],
	        metadata={"warning": "No native profiler implemented for this file type yet."},
	    )

	    suffix = path.suffix.lower()
	    return DocumentProfile(
	        doc_id=doc_id,
	        source_path=str(path),
	        file_type=file_type,
	        page_count=1,
	        extension=suffix,
	        pages=[placeholder_page],
	        labels=["low_confidence"],
	        metadata={"profiler": "unknown"},
	    )