"""Schema.org JSON-LD Validator — Hugging Face Space.
Paste any JSON-LD or fetch from a URL, validate the syntax and
key Schema.org @graph structure.
Built by: https://www.thatdevpro.com (ThatDevPro)
Companion to: https://github.com/Janady13/aio-surfaces
"""
from __future__ import annotations
import json
import re
import urllib.parse
import urllib.request
from typing import Any
import gradio as gr
# User-Agent header value sent by `_fetch_html` on every page fetch; it embeds
# this Space's URL.
UA = "schema-validator/1.0 (+https://huggingface.co/spaces/Janady07/schema-validator)"
def _fetch_html(url: str, *, max_bytes: int = 10 * 1024 * 1024) -> str:
    """Download *url* and return the response body as text.

    The request is sent with the module-level ``UA`` User-Agent and a
    15-second timeout. The body is decoded with the charset declared in the
    response's ``Content-Type`` header; when none is declared, or the declared
    name is not a codec Python knows, UTF-8 is used. Undecodable bytes are
    dropped rather than raising.

    Args:
        url: Absolute URL to fetch.
        max_bytes: Maximum number of body bytes to read. Anything beyond this
            is discarded, so one oversized response cannot exhaust memory.

    Returns:
        The (possibly truncated) body decoded to ``str``.

    Raises:
        Whatever ``urllib.request.urlopen`` raises for an unreachable or
        malformed URL; the caller is expected to handle it.
    """
    req = urllib.request.Request(url, headers={"User-Agent": UA})
    with urllib.request.urlopen(req, timeout=15) as r:
        raw = r.read(max_bytes)
        # Pages are not always UTF-8; honour what the server declares so
        # non-ASCII text inside JSON-LD survives decoding.
        charset = r.headers.get_content_charset() or "utf-8"
    try:
        return raw.decode(charset, "ignore")
    except LookupError:
        # The server declared a charset name Python has no codec for.
        return raw.decode("utf-8", "ignore")
# Matches a <script> element whose type attribute is application/ld+json.
# The attribute value may be double-quoted, single-quoted or unquoted, with
# optional whitespace around "=", and the closing tag may carry whitespace
# before ">". Compiled once at import time instead of on every call.
_JSONLD_SCRIPT_RE = re.compile(
    r"<script\b[^>]*?type\s*=\s*"
    r"(?:"
    r'"\s*application/ld\+json\s*"'
    r"|'\s*application/ld\+json\s*'"
    r"|application/ld\+json(?=[\s>])"
    r")"
    r"[^>]*>(.*?)</script\s*>",
    re.I | re.S,
)


def _extract_jsonld_from_html(html: str) -> list[str]:
    """Return the body of every ``<script type="application/ld+json">`` block.

    Matching is case-insensitive and spans newlines. Each body is returned
    with surrounding whitespace stripped, in document order; an empty list is
    returned when the page has no such block.

    Args:
        html: Raw HTML text to scan.

    Returns:
        The unparsed text content of each JSON-LD script element.
    """
    return [m.group(1).strip() for m in _JSONLD_SCRIPT_RE.finditer(html)]
def _property_count(value: Any) -> int:
    """Count a property's entries: each item of a list, otherwise one."""
    return len(value) if isinstance(value, list) else 1


def _local_type_names(type_value: Any) -> list[str]:
    """Return a node's ``@type`` value(s) as bare names.

    Everything up to the last ``/``, ``#`` or ``:`` is dropped, so
    ``"https://schema.org/Person"`` and ``"schema:Person"`` both become
    ``"Person"``. Non-string entries are stringified.
    """
    values = type_value if isinstance(type_value, list) else [type_value]
    return [re.split(r"[/#:]", str(value))[-1] for value in values]


def _validate_jsonld(text: str) -> dict[str, Any]:
    """Parse *text* as JSON-LD and run basic Schema.org structure checks.

    Args:
        text: A JSON document: a single node, an object with ``@graph``, or
            an array of nodes.

    Returns:
        A dict with these keys:

        * ``valid_json`` - ``True`` when *text* parsed as JSON.
        * ``errors`` - problems that stop validation (parse failure, a
          top-level value that is neither object nor array, an ``@graph``
          that is neither array nor object).
        * ``warnings`` - best-practice findings.
        * ``stats`` - ``node_count``, ``types_seen``, ``node_ids``,
          ``sameAs_total`` and ``identifier_total``; left empty when
          validation stopped on an error.
    """
    result: dict[str, Any] = {
        "valid_json": False,
        "errors": [],
        "warnings": [],
        "stats": {},
    }
    try:
        data = json.loads(text)
        result["valid_json"] = True
    except json.JSONDecodeError as e:
        result["errors"].append(f"JSON parse error: {e}")
        return result

    if isinstance(data, dict):
        # @context belongs on the top-level object whether or not it wraps
        # its nodes in @graph, so check it for both shapes.
        context = data.get("@context")
        if not isinstance(context, (str, dict, list)):
            result["warnings"].append(
                "Missing top-level @context — should be 'https://schema.org' or equivalent"
            )
        elif isinstance(context, str) and "schema.org" not in context:
            result["warnings"].append(
                f"@context is {context!r}, expected to include 'schema.org'"
            )
        if "@graph" in data:
            graph = data["@graph"]
            if isinstance(graph, list):
                nodes = graph
            elif isinstance(graph, dict):
                # A single node object is accepted as a one-element graph.
                nodes = [graph]
            else:
                result["errors"].append(
                    f"@graph must be an array or object, got {type(graph).__name__}"
                )
                return result
        else:
            nodes = [data]
    elif isinstance(data, list):
        nodes = data
    else:
        result["errors"].append(f"Top-level must be object or array, got {type(data).__name__}")
        return result

    # Per-node checks
    types_seen = []
    ids_seen = []
    same_as_count = 0
    identifier_count = 0
    organization_short_on_same_as = False
    person_without_identifier = False
    for i, node in enumerate(nodes):
        if not isinstance(node, dict):
            result["warnings"].append(f"Node {i}: not an object (got {type(node).__name__})")
            continue
        t = node.get("@type")
        if t is None:
            result["warnings"].append(f"Node {i}: missing @type")
            local_types: list[str] = []
        else:
            # map(str, ...) keeps a malformed @type list (numbers, nulls)
            # from raising inside join.
            types_seen.append(
                t if isinstance(t, str) else (",".join(map(str, t)) if isinstance(t, list) else str(t))
            )
            local_types = _local_type_names(t)
        nid = node.get("@id")
        if nid:
            ids_seen.append(nid)
        else:
            # The UI advertises an "every node has @type and @id" check.
            result["warnings"].append(f"Node {i}: missing @id")
        node_same_as = _property_count(node["sameAs"]) if "sameAs" in node else 0
        node_identifiers = _property_count(node["identifier"]) if "identifier" in node else 0
        same_as_count += node_same_as
        identifier_count += node_identifiers
        # Judge each node on its own properties: document-wide totals would
        # let one node's sameAs/identifier entries mask another node's gap.
        if node_same_as < 3 and any(name.endswith("Organization") for name in local_types):
            organization_short_on_same_as = True
        if node_identifiers == 0 and "Person" in local_types:
            person_without_identifier = True

    result["stats"] = {
        "node_count": len(nodes),
        "types_seen": types_seen,
        "node_ids": ids_seen,
        "sameAs_total": same_as_count,
        "identifier_total": identifier_count,
    }

    # Best-practice nudges (reported once each, however many nodes trigger them)
    if organization_short_on_same_as:
        result["warnings"].append(
            "Organization has < 3 sameAs entries — recommend at least 3 trusted profiles "
            "(Wikidata, Crunchbase, LinkedIn, GitHub) for entity disambiguation"
        )
    if person_without_identifier:
        result["warnings"].append(
            "Person node has no identifier — recommend at least one (ORCID, googleKgMID, wikidata)"
        )
    return result
def validate(input_text: str, mode: str) -> tuple[str, str]:
    """Validate pasted JSON-LD, or every JSON-LD block found on a page.

    Args:
        input_text: A URL when *mode* is ``"URL"``, otherwise the JSON-LD
            text itself. ``None`` and whitespace-only input count as empty.
        mode: ``"URL"`` fetches the page and checks each embedded block; any
            other value validates *input_text* directly.

    Returns:
        ``(status_message, report_json)``. The report is an empty string when
        there was nothing to validate (no input, fetch failure, or no JSON-LD
        on the page); otherwise it is indented JSON: a list of per-block
        reports in URL mode, a single report in paste mode.
    """
    payload = (input_text or "").strip()
    if not payload:
        return "Provide JSON-LD text or a URL.", ""

    if mode == "URL":
        # Bare hostnames are assumed to be https.
        has_scheme = payload.startswith(("http://", "https://"))
        target = payload if has_scheme else "https://" + payload
        try:
            page = _fetch_html(target)
        except Exception as exc:
            # UI boundary: surface any fetch problem as a status message.
            return f"Could not fetch {target}: {exc}", ""

        snippets = _extract_jsonld_from_html(page)
        if not snippets:
            return f'No <script type="application/ld+json"> blocks found at {target}', ""

        reports = []
        for index, snippet in enumerate(snippets):
            entry = _validate_jsonld(snippet)
            entry["block_index"] = index
            reports.append(entry)

        clean = len([rep for rep in reports if rep["valid_json"] and not rep["errors"]])
        failing = len([rep for rep in reports if rep["errors"]])
        warning_total = sum(len(rep["warnings"]) for rep in reports)
        summary = "".join(
            [
                f"Found {len(snippets)} JSON-LD block(s) at {target}. ",
                f"{clean} parsed cleanly, ",
                f"{failing} with errors, ",
                f"{warning_total} warnings total.",
            ]
        )
        return summary, json.dumps(reports, indent=2)

    # Direct paste mode
    report = _validate_jsonld(payload)
    lines = ["Direct JSON-LD validation:"]
    if report["valid_json"] and not report["errors"]:
        lines.append("✓ JSON parses cleanly")
    elif not report["valid_json"]:
        lines.append("❌ JSON is not parseable")
    else:
        lines.append(f"❌ {len(report['errors'])} errors")

    if report["warnings"]:
        lines.append(f"⚠ {len(report['warnings'])} warnings (see report)")

    stats = report.get("stats")
    if stats:
        node_total = stats.get("node_count", 0)
        same_as_total = stats.get("sameAs_total", 0)
        identifier_total = stats.get("identifier_total", 0)
        lines.append(
            f"Stats: {node_total} node(s), "
            f"{same_as_total} sameAs entries, "
            f"{identifier_total} identifier entries"
        )
    return "\n".join(lines), json.dumps(report, indent=2)
# Gradio UI, built at import time. `demo` is the Blocks app that the
# `__main__` guard at the bottom launches.
with gr.Blocks(
    title="Schema.org JSON-LD Validator",
    theme=gr.themes.Soft(primary_hue="green"),
) as demo:
    # Intro text: what the tool does and which checks it advertises.
    gr.Markdown(
        """
        # Schema.org JSON-LD Validator
        Validate a Schema.org JSON-LD block — either paste it directly, or
        give a URL and we'll fetch all `<script type="application/ld+json">`
        blocks from the page and check each.
        Best-practice checks include:
        - JSON parses
        - `@context` is `schema.org`
        - Every node has `@type` and `@id`
        - `Organization` has ≥ 3 `sameAs` entries
        - `Person` has at least one `identifier` (ORCID / Wikidata / Google KG MID)
        """
    )
    # NOTE(review): assumes only the radio and the textbox sit inside the Row
    # (they are the two components that set `scale=`) — confirm the layout.
    with gr.Row():
        # The selected value is passed to validate() as `mode`; only the
        # exact string "URL" triggers a page fetch there.
        mode = gr.Radio(
            choices=["URL", "Paste JSON-LD"],
            value="URL",
            label="Input mode",
            scale=1,
        )
        # One textbox serves both modes: a URL, or the JSON-LD text itself.
        inp = gr.Textbox(
            label="URL or JSON-LD",
            placeholder="https://www.thatdevpro.com OR paste JSON-LD…",
            lines=8,
            scale=3,
        )
    btn = gr.Button("Validate", variant="primary")
    # The two outputs receive validate()'s (status_message, report_json) pair.
    status = gr.Textbox(label="Status", lines=4, interactive=False)
    out = gr.Code(label="Validation report", language="json", lines=24)
    # Input order must match validate(input_text, mode).
    btn.click(fn=validate, inputs=[inp, mode], outputs=[status, out])
    # Footer with attribution links.
    gr.Markdown(
        """
        ---
        Built by **[ThatDevPro](https://www.thatdevpro.com)** ·
        Companion to **[aio-surfaces](https://github.com/Janady13/aio-surfaces)** (the toolkit that generates Schema.org @graph from a typed site config) ·
        Source for this validator: open issue if interested.
        """
    )

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
|