from __future__ import annotations import csv import json import os from dataclasses import dataclass from functools import lru_cache from pathlib import Path try: from .config import IAB_TAXONOMY_GRAPH_PATH, IAB_TAXONOMY_PATH, IAB_TAXONOMY_VERSION # type: ignore except ImportError: from config import IAB_TAXONOMY_GRAPH_PATH, IAB_TAXONOMY_PATH, IAB_TAXONOMY_VERSION _DEFAULT_MODEL_REPO_ID = "admesh/agentic-intent-classifier" @dataclass(frozen=True) class IabNode: unique_id: str parent_id: str | None label: str path: tuple[str, ...] @property def level(self) -> int: return len(self.path) @property def path_label(self) -> str: return path_to_label(self.path) class IabTaxonomy: def __init__(self, nodes: list[IabNode]): self.nodes = nodes self._path_index = {node.path: node for node in nodes} self._children_index: dict[tuple[str, ...], list[IabNode]] = {} self._level_index: dict[int, list[IabNode]] = {} for node in nodes: self._children_index.setdefault(node.path[:-1], []).append(node) self._level_index.setdefault(node.level, []).append(node) for children in self._children_index.values(): children.sort(key=lambda item: item.path) for level_nodes in self._level_index.values(): level_nodes.sort(key=lambda item: item.path) def get_node(self, path: tuple[str, ...]) -> IabNode: if path not in self._path_index: raise KeyError(f"Unknown IAB path: {path}") return self._path_index[path] def build_level(self, path: tuple[str, ...]) -> dict: node = self.get_node(path) return {"id": node.unique_id, "label": node.label} def has_path(self, path: tuple[str, ...]) -> bool: return path in self._path_index def immediate_children(self, prefix: tuple[str, ...]) -> list[IabNode]: return list(self._children_index.get(prefix, [])) def siblings(self, path: tuple[str, ...]) -> list[IabNode]: node = self.get_node(path) return [candidate for candidate in self._children_index.get(path[:-1], []) if candidate.path != node.path] def level_nodes(self, level: int) -> list[IabNode]: return list(self._level_index.get(level, [])) def to_training_graph(self) -> dict: nodes = [] for node in self.nodes: child_nodes = self.immediate_children(node.path) sibling_nodes = self.siblings(node.path) nodes.append( { "node_id": node.unique_id, "parent_id": node.parent_id, "level": node.level, "label": node.label, "path": list(node.path), "path_label": node.path_label, "child_ids": [child.unique_id for child in child_nodes], "child_paths": [child.path_label for child in child_nodes], "sibling_ids": [sibling.unique_id for sibling in sibling_nodes], "sibling_paths": [sibling.path_label for sibling in sibling_nodes], "canonical_surface_name": node.label, } ) return { "taxonomy": "IAB Content Taxonomy", "taxonomy_version": IAB_TAXONOMY_VERSION, "node_count": len(nodes), "level_counts": { f"tier{level}": len(self.level_nodes(level)) for level in range(1, 5) }, "nodes": nodes, } def build_content_object(self, path: tuple[str, ...], mapping_mode: str, mapping_confidence: float) -> dict: if not path: raise ValueError("IAB path must not be empty") payload = { "taxonomy": "IAB Content Taxonomy", "taxonomy_version": IAB_TAXONOMY_VERSION, "tier1": self.build_level(path[:1]), "mapping_mode": mapping_mode, "mapping_confidence": round(float(mapping_confidence), 4), } if len(path) >= 2: payload["tier2"] = self.build_level(path[:2]) if len(path) >= 3: payload["tier3"] = self.build_level(path[:3]) if len(path) >= 4: payload["tier4"] = self.build_level(path[:4]) return payload def build_content_object_from_label( self, path_label: str, mapping_mode: str, mapping_confidence: float, ) -> dict: return self.build_content_object( path=parse_path_label(path_label), mapping_mode=mapping_mode, mapping_confidence=mapping_confidence, ) def parse_path_label(path_label: str) -> tuple[str, ...]: path = tuple(part.strip() for part in path_label.split(">") if part.strip()) if not path: raise ValueError("IAB path label must not be empty") return path def path_to_label(path: tuple[str, ...]) -> str: if not path: raise ValueError("IAB path must not be empty") return " > ".join(path) def _load_rows(path: Path) -> list[dict]: with path.open("r", encoding="utf-8") as handle: reader = csv.reader(handle, delimiter="\t") rows = list(reader) header = rows[1] data_rows = rows[2:] parsed = [] for row in data_rows: padded = row + [""] * (len(header) - len(row)) parsed.append(dict(zip(header, padded))) return parsed def _resolve_taxonomy_path() -> Path: """Resolve taxonomy TSV path for local and HF trust_remote_code environments.""" if IAB_TAXONOMY_PATH.exists(): return IAB_TAXONOMY_PATH # HF dynamic modules often do not contain non-Python data files. # Fetch the taxonomy TSV directly from the model repo as a fallback. repo_id = os.environ.get("ADMESH_MODEL_REPO_ID", _DEFAULT_MODEL_REPO_ID).strip() or _DEFAULT_MODEL_REPO_ID revision = os.environ.get("ADMESH_MODEL_REVISION", "").strip() or None filename = f"data/iab-content/Content Taxonomy {IAB_TAXONOMY_VERSION}.tsv" try: from huggingface_hub import hf_hub_download except ModuleNotFoundError as exc: raise FileNotFoundError( f"Taxonomy TSV missing at {IAB_TAXONOMY_PATH}; install huggingface_hub or provide local taxonomy file." ) from exc downloaded = hf_hub_download( repo_id=repo_id, repo_type="model", filename=filename, revision=revision, ) return Path(downloaded) @lru_cache(maxsize=1) def get_iab_taxonomy() -> IabTaxonomy: nodes = [] for row in _load_rows(_resolve_taxonomy_path()): path = tuple( value.strip() for key in ("Tier 1", "Tier 2", "Tier 3", "Tier 4") if (value := row.get(key, "").strip()) ) if not path: continue nodes.append( IabNode( unique_id=row["Unique ID"].strip(), parent_id=row["Parent"].strip() or None, label=row["Name"].strip(), path=path, ) ) return IabTaxonomy(nodes) def write_training_graph(path: Path = IAB_TAXONOMY_GRAPH_PATH) -> Path: taxonomy = get_iab_taxonomy() path.parent.mkdir(parents=True, exist_ok=True) path.write_text(json.dumps(taxonomy.to_training_graph(), indent=2, sort_keys=True) + "\n", encoding="utf-8") return path