File size: 7,442 Bytes

from __future__ import annotations

import csv
import json
import os
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path

try:
    from .config import IAB_TAXONOMY_GRAPH_PATH, IAB_TAXONOMY_PATH, IAB_TAXONOMY_VERSION  # type: ignore
except ImportError:
    from config import IAB_TAXONOMY_GRAPH_PATH, IAB_TAXONOMY_PATH, IAB_TAXONOMY_VERSION

_DEFAULT_MODEL_REPO_ID = "admesh/agentic-intent-classifier"


@dataclass(frozen=True)
class IabNode:
    unique_id: str
    parent_id: str | None
    label: str
    path: tuple[str, ...]

    @property
    def level(self) -> int:
        return len(self.path)

    @property
    def path_label(self) -> str:
        return path_to_label(self.path)


class IabTaxonomy:
    def __init__(self, nodes: list[IabNode]):
        self.nodes = nodes
        self._path_index = {node.path: node for node in nodes}
        self._children_index: dict[tuple[str, ...], list[IabNode]] = {}
        self._level_index: dict[int, list[IabNode]] = {}
        for node in nodes:
            self._children_index.setdefault(node.path[:-1], []).append(node)
            self._level_index.setdefault(node.level, []).append(node)
        for children in self._children_index.values():
            children.sort(key=lambda item: item.path)
        for level_nodes in self._level_index.values():
            level_nodes.sort(key=lambda item: item.path)

    def get_node(self, path: tuple[str, ...]) -> IabNode:
        if path not in self._path_index:
            raise KeyError(f"Unknown IAB path: {path}")
        return self._path_index[path]

    def build_level(self, path: tuple[str, ...]) -> dict:
        node = self.get_node(path)
        return {"id": node.unique_id, "label": node.label}

    def has_path(self, path: tuple[str, ...]) -> bool:
        return path in self._path_index

    def immediate_children(self, prefix: tuple[str, ...]) -> list[IabNode]:
        return list(self._children_index.get(prefix, []))

    def siblings(self, path: tuple[str, ...]) -> list[IabNode]:
        node = self.get_node(path)
        return [candidate for candidate in self._children_index.get(path[:-1], []) if candidate.path != node.path]

    def level_nodes(self, level: int) -> list[IabNode]:
        return list(self._level_index.get(level, []))

    def to_training_graph(self) -> dict:
        nodes = []
        for node in self.nodes:
            child_nodes = self.immediate_children(node.path)
            sibling_nodes = self.siblings(node.path)
            nodes.append(
                {
                    "node_id": node.unique_id,
                    "parent_id": node.parent_id,
                    "level": node.level,
                    "label": node.label,
                    "path": list(node.path),
                    "path_label": node.path_label,
                    "child_ids": [child.unique_id for child in child_nodes],
                    "child_paths": [child.path_label for child in child_nodes],
                    "sibling_ids": [sibling.unique_id for sibling in sibling_nodes],
                    "sibling_paths": [sibling.path_label for sibling in sibling_nodes],
                    "canonical_surface_name": node.label,
                }
            )
        return {
            "taxonomy": "IAB Content Taxonomy",
            "taxonomy_version": IAB_TAXONOMY_VERSION,
            "node_count": len(nodes),
            "level_counts": {
                f"tier{level}": len(self.level_nodes(level))
                for level in range(1, 5)
            },
            "nodes": nodes,
        }

    def build_content_object(self, path: tuple[str, ...], mapping_mode: str, mapping_confidence: float) -> dict:
        if not path:
            raise ValueError("IAB path must not be empty")

        payload = {
            "taxonomy": "IAB Content Taxonomy",
            "taxonomy_version": IAB_TAXONOMY_VERSION,
            "tier1": self.build_level(path[:1]),
            "mapping_mode": mapping_mode,
            "mapping_confidence": round(float(mapping_confidence), 4),
        }
        if len(path) >= 2:
            payload["tier2"] = self.build_level(path[:2])
        if len(path) >= 3:
            payload["tier3"] = self.build_level(path[:3])
        if len(path) >= 4:
            payload["tier4"] = self.build_level(path[:4])
        return payload

    def build_content_object_from_label(
        self,
        path_label: str,
        mapping_mode: str,
        mapping_confidence: float,
    ) -> dict:
        return self.build_content_object(
            path=parse_path_label(path_label),
            mapping_mode=mapping_mode,
            mapping_confidence=mapping_confidence,
        )


def parse_path_label(path_label: str) -> tuple[str, ...]:
    path = tuple(part.strip() for part in path_label.split(">") if part.strip())
    if not path:
        raise ValueError("IAB path label must not be empty")
    return path


def path_to_label(path: tuple[str, ...]) -> str:
    if not path:
        raise ValueError("IAB path must not be empty")
    return " > ".join(path)


def _load_rows(path: Path) -> list[dict]:
    with path.open("r", encoding="utf-8") as handle:
        reader = csv.reader(handle, delimiter="\t")
        rows = list(reader)

    header = rows[1]
    data_rows = rows[2:]
    parsed = []
    for row in data_rows:
        padded = row + [""] * (len(header) - len(row))
        parsed.append(dict(zip(header, padded)))
    return parsed


def _resolve_taxonomy_path() -> Path:
    """Resolve taxonomy TSV path for local and HF trust_remote_code environments."""
    if IAB_TAXONOMY_PATH.exists():
        return IAB_TAXONOMY_PATH

    # HF dynamic modules often do not contain non-Python data files.
    # Fetch the taxonomy TSV directly from the model repo as a fallback.
    repo_id = os.environ.get("ADMESH_MODEL_REPO_ID", _DEFAULT_MODEL_REPO_ID).strip() or _DEFAULT_MODEL_REPO_ID
    revision = os.environ.get("ADMESH_MODEL_REVISION", "").strip() or None
    filename = f"data/iab-content/Content Taxonomy {IAB_TAXONOMY_VERSION}.tsv"
    try:
        from huggingface_hub import hf_hub_download
    except ModuleNotFoundError as exc:
        raise FileNotFoundError(
            f"Taxonomy TSV missing at {IAB_TAXONOMY_PATH}; install huggingface_hub or provide local taxonomy file."
        ) from exc

    downloaded = hf_hub_download(
        repo_id=repo_id,
        repo_type="model",
        filename=filename,
        revision=revision,
    )
    return Path(downloaded)


@lru_cache(maxsize=1)
def get_iab_taxonomy() -> IabTaxonomy:
    nodes = []
    for row in _load_rows(_resolve_taxonomy_path()):
        path = tuple(
            value.strip()
            for key in ("Tier 1", "Tier 2", "Tier 3", "Tier 4")
            if (value := row.get(key, "").strip())
        )
        if not path:
            continue
        nodes.append(
            IabNode(
                unique_id=row["Unique ID"].strip(),
                parent_id=row["Parent"].strip() or None,
                label=row["Name"].strip(),
                path=path,
            )
        )
    return IabTaxonomy(nodes)


def write_training_graph(path: Path = IAB_TAXONOMY_GRAPH_PATH) -> Path:
    taxonomy = get_iab_taxonomy()
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(taxonomy.to_training_graph(), indent=2, sort_keys=True) + "\n", encoding="utf-8")
    return path