zeroshotGPU / zsgdp /normalize /normalize_unstructured.py
Arjunvir Singh
Initial commit: zeroshotGPU MVP with full eval surface
db06ffa
"""Normalize Unstructured partition elements into the canonical parse schema."""
from __future__ import annotations
from typing import Any, Iterable
from zsgdp.normalize.markdown import normalize_markdown_candidate
from zsgdp.schema import DocumentProfile, ParseCandidate
def normalize_unstructured_parts(
    *,
    parts: Iterable[Any],
    profile: DocumentProfile,
    source_path: str,
) -> ParseCandidate:
    """Build a parse candidate from partition elements.

    The elements are first rendered to a single markdown string, which is
    then handed to ``normalize_markdown_candidate`` together with the
    document id and file type taken from *profile*.

    Args:
        parts: Partition elements to render; each is stringified to obtain
            its text.
        profile: Supplies ``doc_id`` and ``file_type`` for the candidate.
        source_path: Forwarded unchanged to the markdown normalizer.

    Returns:
        Whatever ``normalize_markdown_candidate`` produces, tagged with
        parser name ``"unstructured"`` and a fixed confidence of 0.78.
    """
    origin = {"backend": "unstructured", "normalizer": "normalize_unstructured_parts"}
    return normalize_markdown_candidate(
        markdown=_parts_to_markdown(parts),
        doc_id=profile.doc_id,
        source_path=source_path,
        file_type=profile.file_type,
        parser_name="unstructured",
        confidence=0.78,
        provenance=origin,
    )
def _parts_to_markdown(parts: Iterable[Any]) -> str:
    """Render partition elements as one markdown string.

    Elements whose stringified text is empty after stripping are skipped.
    Whenever an element reports a truthy page number different from the
    last one seen, a ``<!-- page:N -->`` marker is emitted before it.
    Every rendered element is followed by a blank line.

    Returns:
        The joined text, stripped and terminated by a single newline, or
        an empty string when nothing was rendered.
    """
    out: list[str] = []
    last_page: int | None = None

    for element in parts:
        body = str(element).strip()
        if body:
            page = _part_page_num(element)
            # A falsy page (None or 0) never triggers a marker.
            if page and page != last_page:
                last_page = page
                if out:
                    out.append("")
                out.extend((f"<!-- page:{page} -->", ""))
            out.extend((_part_to_markdown(element, body), ""))

    if not out:
        return ""
    return "\n".join(out).strip() + "\n"
def _part_to_markdown(part: Any, text: str) -> str:
    """Return the markdown for a single element.

    The lower-cased category decides the rendering:

    * ``table`` -- the stripped ``text_as_html`` metadata value when it is
      truthy, otherwise *text*.
    * ``title`` / ``header`` -- *text* prefixed with ``"# "`` unless it
      already starts with ``#``.
    * anything else -- *text* unchanged.
    """
    kind = _part_category(part).lower()

    if kind == "table":
        table_html = _part_metadata_value(part, "text_as_html")
        if table_html:
            return str(table_html).strip()
        return text

    is_heading = kind in ("title", "header")
    if is_heading and not text.startswith("#"):
        return f"# {text}"
    return text
def _part_category(part: Any) -> str:
category = getattr(part, "category", None)
if category:
return str(category)
return part.__class__.__name__
def _part_page_num(part: Any) -> int | None:
    """Return the element's ``page_number`` metadata as an int.

    ``None`` is returned when the value is absent or when ``int()`` rejects
    it with ``TypeError`` or ``ValueError``.
    """
    raw = _part_metadata_value(part, "page_number")
    if raw is None:
        return None
    try:
        return int(raw)
    except (TypeError, ValueError):
        return None
def _part_metadata_value(part: Any, key: str) -> Any:
metadata = getattr(part, "metadata", None)
if metadata is None:
return None
if isinstance(metadata, dict):
return metadata.get(key)
return getattr(metadata, key, None)