"""Dataset-specific logic for the Text2SPARQL repair pipeline. Handles loading dataset configs, KG profiles, and endpoint URLs. All dataset-specific hacks remain inside this file. """ from __future__ import annotations import json import logging from pathlib import Path from .config import RuntimeConfig from .models import DatasetConfig logger = logging.getLogger(__name__) def load_dataset_config(dataset_id: str, config: RuntimeConfig) -> DatasetConfig: """Load dataset configuration from the runtime config. Args: dataset_id: The dataset identifier URL. config: The runtime configuration containing dataset definitions. Returns: DatasetConfig for the requested dataset. Raises: ValueError: If dataset_id is not found in the config. """ if dataset_id not in config.datasets: available = list(config.datasets.keys()) raise ValueError( f"Unknown dataset_id: {dataset_id!r}. " f"Available datasets: {available}" ) ds_raw = config.datasets[dataset_id] return DatasetConfig( dataset_id=dataset_id, endpoint_url=ds_raw["endpoint_url"], kg_profile_path=ds_raw.get("kg_profile_path", ""), default_prefixes=ds_raw.get("default_prefixes", {}), mode=ds_raw.get("mode", "dbpedia"), ) def load_kg_profile(dataset: DatasetConfig) -> dict: """Load the KG profile for a dataset. The KG profile contains entity labels, property labels, and class labels used for context building. Returns an empty profile structure if the profile file does not exist yet. Args: dataset: The dataset configuration. Returns: Dictionary with keys: entities, properties, classes. """ empty_profile = { "entities": [], "properties": [], "classes": [], "metadata": { "dataset_id": dataset.dataset_id, "mode": dataset.mode, }, } profile_path = Path(dataset.kg_profile_path) if not profile_path.exists(): logger.warning( "KG profile not found at %s — using empty profile. " "Context building will rely on prefix hints only.", dataset.kg_profile_path, ) return empty_profile try: with open(profile_path, "r") as f: profile = json.load(f) # Ensure required keys exist for key in ("entities", "properties", "classes"): if key not in profile: profile[key] = [] return profile except (json.JSONDecodeError, OSError) as exc: logger.error("Failed to load KG profile from %s: %s", profile_path, exc) return empty_profile def get_endpoint_url(dataset: DatasetConfig) -> str: """Get the SPARQL endpoint URL for a dataset. Args: dataset: The dataset configuration. Returns: The endpoint URL string. """ return dataset.endpoint_url