| { |
| "$schema": "http://json-schema.org/draft-07/schema#", |
| "title": "GeoQuery Data Catalog Entry", |
| "description": "Schema for dataset metadata in the GeoQuery platform catalog", |
| "type": "object", |
| "required": [ |
| "path", |
| "columns", |
| "category", |
| "format" |
| ], |
| "properties": { |
| "path": { |
| "type": "string", |
| "description": "Relative path to the data file from the data directory" |
| }, |
| "description": { |
| "type": "string", |
| "description": "Auto-generated basic description (e.g., 'Data from hdx/health.geojson')" |
| }, |
| "semantic_description": { |
| "type": [ |
| "string", |
| "null" |
| ], |
| "description": "LLM-generated rich description explaining the dataset's contents and use cases" |
| }, |
| "tags": { |
| "type": "array", |
| "items": { |
| "type": "string" |
| }, |
| "description": "Searchable tags for categorization (e.g., ['health', 'facilities', 'infrastructure'])" |
| }, |
| "data_type": { |
| "type": "string", |
| "enum": [ |
| "static", |
| "semi-static", |
| "realtime" |
| ], |
| "description": "How frequently the data changes", |
| "default": "static" |
| }, |
| "update_frequency": { |
| "type": [ |
| "string", |
| "null" |
| ], |
| "enum": [ |
| null, |
| "yearly", |
| "monthly", |
| "weekly", |
| "daily", |
| "hourly", |
| "realtime" |
| ], |
| "description": "Expected update frequency for the dataset" |
| }, |
| "columns": { |
| "type": "array", |
| "items": { |
| "type": "string" |
| }, |
| "description": "List of column names in the dataset" |
| }, |
| "row_count": { |
| "type": [ |
| "integer", |
| "null" |
| ], |
| "description": "Number of features/rows in the dataset" |
| }, |
| "category": { |
| "type": "string", |
| "description": "Source category (base, osm, hdx, inec, custom)" |
| }, |
| "format": { |
| "type": "string", |
| "enum": [ |
| "geojson", |
| "shapefile", |
| "geoparquet", |
| "csv" |
| ], |
| "description": "File format of the source data" |
| }, |
| "geometry_type": { |
| "type": [ |
| "string", |
| "null" |
| ], |
| "enum": [ |
| null, |
| "Point", |
| "MultiPoint", |
| "LineString", |
| "MultiLineString", |
| "Polygon", |
| "MultiPolygon" |
| ], |
| "description": "Geometry type of the features in the dataset, using GeoJSON geometry type names, or null" |
| }, |
| "bbox": { |
| "type": [ |
| "array", |
| "null" |
| ], |
| "items": { |
| "type": "number" |
| }, |
| "minItems": 4, |
| "maxItems": 4, |
| "description": "Bounding box [minLon, minLat, maxLon, maxLat]" |
| }, |
| "source": { |
| "type": [ |
| "string", |
| "null" |
| ], |
| "description": "Original source of the data (e.g., 'OpenStreetMap', 'INEC Census 2023')" |
| }, |
| "license": { |
| "type": [ |
| "string", |
| "null" |
| ], |
| "description": "Data license (e.g., 'ODbL', 'CC-BY-4.0', 'Public Domain')" |
| }, |
| "last_indexed": { |
| "type": "string", |
| "format": "date-time", |
| "description": "ISO 8601 timestamp (RFC 3339 date-time) of when the dataset was last indexed" |
| }, |
| "last_enriched": { |
| "type": [ |
| "string", |
| "null" |
| ], |
| "format": "date-time", |
| "description": "ISO 8601 timestamp (RFC 3339 date-time) of when LLM enrichment was last run, or null" |
| } |
| } |
| } |