"""Normalize Unstructured partition elements into the canonical parse schema.""" from __future__ import annotations from typing import Any, Iterable from zsgdp.normalize.markdown import normalize_markdown_candidate from zsgdp.schema import DocumentProfile, ParseCandidate def normalize_unstructured_parts( *, parts: Iterable[Any], profile: DocumentProfile, source_path: str, ) -> ParseCandidate: markdown = _parts_to_markdown(parts) candidate = normalize_markdown_candidate( markdown=markdown, doc_id=profile.doc_id, source_path=source_path, file_type=profile.file_type, parser_name="unstructured", confidence=0.78, provenance={"backend": "unstructured", "normalizer": "normalize_unstructured_parts"}, ) return candidate def _parts_to_markdown(parts: Iterable[Any]) -> str: lines: list[str] = [] current_page: int | None = None for part in parts: text = str(part).strip() if not text: continue page_num = _part_page_num(part) if page_num and page_num != current_page: current_page = page_num if lines: lines.append("") lines.append(f"") lines.append("") lines.append(_part_to_markdown(part, text)) lines.append("") return "\n".join(lines).strip() + ("\n" if lines else "") def _part_to_markdown(part: Any, text: str) -> str: category = _part_category(part).lower() if category in {"title", "header"} and not text.startswith("#"): return f"# {text}" if category in {"table"}: html = _part_metadata_value(part, "text_as_html") return str(html).strip() if html else text return text def _part_category(part: Any) -> str: category = getattr(part, "category", None) if category: return str(category) return part.__class__.__name__ def _part_page_num(part: Any) -> int | None: value = _part_metadata_value(part, "page_number") try: return int(value) if value is not None else None except (TypeError, ValueError): return None def _part_metadata_value(part: Any, key: str) -> Any: metadata = getattr(part, "metadata", None) if metadata is None: return None if isinstance(metadata, dict): return metadata.get(key) return getattr(metadata, key, None)