"""Schema.org JSON-LD Validator — Hugging Face Space. Paste any JSON-LD or fetch from a URL, validate the syntax and key Schema.org @graph structure. Built by: https://www.thatdevpro.com (ThatDevPro) Companion to: https://github.com/Janady13/aio-surfaces """ from __future__ import annotations import json import re import urllib.parse import urllib.request from typing import Any import gradio as gr UA = "schema-validator/1.0 (+https://huggingface.co/spaces/Janady07/schema-validator)" def _fetch_html(url: str) -> str: req = urllib.request.Request(url, headers={"User-Agent": UA}) with urllib.request.urlopen(req, timeout=15) as r: return r.read().decode("utf-8", "ignore") def _extract_jsonld_from_html(html: str) -> list[str]: """Pull all ', re.I | re.S, ) return [m.group(1).strip() for m in pattern.finditer(html)] def _validate_jsonld(text: str) -> dict[str, Any]: """Parse + run basic Schema.org @graph validation.""" result: dict[str, Any] = { "valid_json": False, "errors": [], "warnings": [], "stats": {}, } try: data = json.loads(text) result["valid_json"] = True except json.JSONDecodeError as e: result["errors"].append(f"JSON parse error: {e}") return result # Check @context if isinstance(data, dict): nodes = [] if "@graph" in data: nodes = data["@graph"] if not isinstance(data.get("@context"), (str, dict, list)): result["warnings"].append( "Missing top-level @context — should be 'https://schema.org' or equivalent" ) if (isinstance(data.get("@context"), str) and "schema.org" not in data["@context"]): result["warnings"].append( f"@context is {data['@context']!r}, expected to include 'schema.org'" ) else: nodes = [data] elif isinstance(data, list): nodes = data else: result["errors"].append(f"Top-level must be object or array, got {type(data).__name__}") return result # Per-node checks types_seen = [] ids_seen = [] same_as_count = 0 identifier_count = 0 for i, node in enumerate(nodes): if not isinstance(node, dict): result["warnings"].append(f"Node {i}: not an object (got {type(node).__name__})") continue t = node.get("@type") nid = node.get("@id") if t is None: result["warnings"].append(f"Node {i}: missing @type") else: types_seen.append(t if isinstance(t, str) else (",".join(t) if isinstance(t, list) else str(t))) if nid: ids_seen.append(nid) if "sameAs" in node: sa = node["sameAs"] same_as_count += len(sa) if isinstance(sa, list) else 1 if "identifier" in node: ids_field = node["identifier"] identifier_count += len(ids_field) if isinstance(ids_field, list) else 1 result["stats"] = { "node_count": len(nodes), "types_seen": types_seen, "node_ids": ids_seen, "sameAs_total": same_as_count, "identifier_total": identifier_count, } # Best-practice nudges if "Organization" in str(types_seen) and same_as_count < 3: result["warnings"].append( "Organization has < 3 sameAs entries — recommend at least 3 trusted profiles " "(Wikidata, Crunchbase, LinkedIn, GitHub) for entity disambiguation" ) if "Person" in str(types_seen) and identifier_count == 0: result["warnings"].append( "Person node has no identifier — recommend at least one (ORCID, googleKgMID, wikidata)" ) return result def validate(input_text: str, mode: str) -> tuple[str, str]: """Returns (status_message, validation_report_json).""" input_text = (input_text or "").strip() if not input_text: return "Provide JSON-LD text or a URL.", "" if mode == "URL": url = input_text if not url.startswith(("http://", "https://")): url = "https://" + url try: html = _fetch_html(url) except Exception as e: return f"Could not fetch {url}: {e}", "" blocks = _extract_jsonld_from_html(html) if not blocks: return f"No