Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
File size: 2,978 Bytes
"""Dataset-specific logic for the Text2SPARQL repair pipeline.
Handles loading dataset configs, KG profiles, and endpoint URLs.
All dataset-specific hacks remain inside this file.
"""
from __future__ import annotations
import json
import logging
from pathlib import Path
from .config import RuntimeConfig
from .models import DatasetConfig
logger = logging.getLogger(__name__)
def load_dataset_config(dataset_id: str, config: RuntimeConfig) -> DatasetConfig:
    """Build the DatasetConfig for ``dataset_id`` from the runtime config.

    The raw entry is looked up in ``config.datasets``. ``endpoint_url`` is
    read by subscript and is therefore mandatory; the remaining fields fall
    back to defaults when absent.

    Args:
        dataset_id: Key identifying the dataset inside ``config.datasets``.
        config: Runtime configuration holding the dataset definitions.

    Returns:
        A DatasetConfig populated from the matching raw entry.

    Raises:
        ValueError: If ``dataset_id`` is not a key of ``config.datasets``.
    """
    registry = config.datasets

    if dataset_id in registry:
        entry = registry[dataset_id]
        # Values are collected in the same order the constructor receives them.
        fields = {
            "dataset_id": dataset_id,
            "endpoint_url": entry["endpoint_url"],  # no default: must be present
            "kg_profile_path": entry.get("kg_profile_path", ""),
            "default_prefixes": entry.get("default_prefixes", {}),
            "mode": entry.get("mode", "dbpedia"),
        }
        return DatasetConfig(**fields)

    # Unknown id: list the known ids so the caller can spot a typo.
    known_ids = [*registry.keys()]
    raise ValueError(
        f"Unknown dataset_id: {dataset_id!r}. "
        f"Available datasets: {known_ids}"
    )
def load_kg_profile(dataset: DatasetConfig) -> dict:
    """Load the KG profile for a dataset.

    The profile is a JSON object read from ``dataset.kg_profile_path``. Any
    of the keys ``entities``, ``properties`` and ``classes`` that are missing
    from the file are added as empty lists. Loading is best-effort: when the
    file is absent, unreadable, not valid UTF-8 JSON, or not a JSON object,
    the problem is logged and an empty profile is returned instead.

    Args:
        dataset: The dataset configuration; ``kg_profile_path``,
            ``dataset_id`` and ``mode`` are read from it.

    Returns:
        The loaded profile dictionary, guaranteed to contain the keys
        ``entities``, ``properties`` and ``classes``. The fallback empty
        profile additionally carries a ``metadata`` entry with the
        dataset id and mode.
    """
    empty_profile = {
        "entities": [],
        "properties": [],
        "classes": [],
        "metadata": {
            "dataset_id": dataset.dataset_id,
            "mode": dataset.mode,
        },
    }

    raw_path = dataset.kg_profile_path
    # An unset path ("") would become Path("."), which exists but is a
    # directory; treat it, and any other non-file, as "no profile".
    if not raw_path or not Path(raw_path).is_file():
        logger.warning(
            "KG profile not found at %s — using empty profile. "
            "Context building will rely on prefix hints only.",
            raw_path,
        )
        return empty_profile

    profile_path = Path(raw_path)
    try:
        # Read as UTF-8 explicitly so decoding does not depend on the locale.
        with open(profile_path, "r", encoding="utf-8") as f:
            profile = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError, OSError) as exc:
        logger.error("Failed to load KG profile from %s: %s", profile_path, exc)
        return empty_profile

    # Valid JSON can still be a list or scalar, which cannot hold the keys.
    if not isinstance(profile, dict):
        logger.error(
            "Failed to load KG profile from %s: %s",
            profile_path,
            f"expected a JSON object, got {type(profile).__name__}",
        )
        return empty_profile

    # Ensure required keys exist
    for key in ("entities", "properties", "classes"):
        profile.setdefault(key, [])
    return profile
def get_endpoint_url(dataset: DatasetConfig) -> str:
    """Return the SPARQL endpoint URL stored on a dataset configuration.

    Args:
        dataset: The dataset configuration to read from.

    Returns:
        The value of ``dataset.endpoint_url``, unchanged.
    """
    endpoint = dataset.endpoint_url
    return endpoint
|