Spaces:

BramVanroy
/

bcql_py_validation

Running

App Files Files Community

bcql_py_validation / app.py

BramVanroy

Sync from GitHub via hub-sync

b6496cb verified 6 days ago

raw

history blame contribute delete

16.6 kB

	"""Gradio demo for ``bcql_py`` query validation.

	A small, illustrative web UI that lets users paste a BlackLab Corpus Query Language
	(BCQL) query, optionally pick or customize a [CorpusSpec][bcql_py.validation.CorpusSpec],
	and inspect parsing / validation results in real time.

	Run locally with::

	    uv sync --group app
	    uv run python app/app.py

	The same script powers the hosted demo on Hugging Face Spaces.
	"""

	from __future__ import annotations

	from typing import Any

	import gradio as gr

	from bcql_py import (
	BCQLSyntaxError,
	BCQLValidationError,
	CorpusSpec,
	parse,
	)
	from bcql_py.validation.presets import LASSY, UD


	# Dropdown label -> corpus spec. ``None`` means no spec is passed to ``parse``,
	# which the UI describes as "only syntactic parsing is performed".
	PRESETS: dict[str, CorpusSpec | None] = {
	    "None (permissive)": None,
	    "Universal Dependencies (UD)": UD,
	    "Lassy / Alpino": LASSY,
	}

	# Rows of ``[query, preset label]`` handed to ``gr.Examples``. The second item
	# is the value written into the preset dropdown, whose choices are the keys of
	# ``PRESETS``.
	EXAMPLES: list[list[str]] = [
	['"man"', "None (permissive)"],
	['[lemma="search" & pos="NOUN"]', "Universal Dependencies (UD)"],
	['"the" [pos="ADJ"]+ "man"', "Universal Dependencies (UD)"],
	['<s/> containing "fluffy"', "None (permissive)"],
	['"baker" within <ne type="PERS"/>', "None (permissive)"],
	['[pos="VERB"] -nsubj-> [pos="NOUN"]', "Universal Dependencies (UD)"],
	['A:[pos="ADJ"] "man"', "Universal Dependencies (UD)"],
	# NOTE(review): the last three look like intentional failure demos (dangling
	# ``&``, unknown POS value, unknown attribute) - confirm against the parser.
	['[pos="NOUN" &]', "None (permissive)"],
	['[pos="BANANA"]', "Universal Dependencies (UD)"],
	['[unknownattr="x"]', "Universal Dependencies (UD)"],
	]

	# Titles of the result tabs, looked up by index when the tabs are created.
	TAB_LABELS: tuple[str, ...] = (
	"AST (JSON)",
	"Active spec",
	)


	# Indigo (mkdocs-material primary) shades:
	INDIGO = gr.themes.Color(
	name="indigo",
	c50="#e8eaf6",
	c100="#c5cae9",
	c200="#9fa8da",
	c300="#7986cb",
	c400="#5c6bc0",
	c500="#3f51b5",
	c600="#3949ab",
	c700="#303f9f",
	c800="#283593",
	c900="#1a237e",
	c950="#0d1442",
	)

	# App theme: Gradio's ``Soft`` theme with the indigo palette as both primary
	# and secondary hue and Roboto / Roboto Mono fonts (with generic fallbacks).
	# The trailing ``.set(...)`` overrides individual theme variables.
	THEME = gr.themes.Soft(
	primary_hue=INDIGO,
	secondary_hue=INDIGO,
	neutral_hue="slate",
	font=(
	gr.themes.GoogleFont("Roboto"),
	"ui-sans-serif",
	"system-ui",
	"sans-serif",
	),
	font_mono=(
	gr.themes.GoogleFont("Roboto Mono"),
	"ui-monospace",
	"Consolas",
	"monospace",
	),
	).set(
	body_background_fill="*neutral_50",
	body_background_fill_dark="*neutral_950",
	block_radius="*radius_lg",
	button_primary_background_fill="*primary_500",
	button_primary_background_fill_hover="*primary_600",
	button_primary_text_color="white",
	)

	# Theming: https://www.gradio.app/guides/theming-guide#extending-themes-via-set
	# Extra stylesheet passed to ``demo.launch(css=...)``. It styles the page
	# header, the OK / error status banners (with ``.dark`` overrides) and the
	# footer; the class names match the markup emitted elsewhere in this file.
	CUSTOM_CSS = """
	.bcql-header {
	text-align: center;
	padding: 1.25rem 0 0.5rem 0;
	}
	.bcql-header h1 {
	font-weight: 700;
	letter-spacing: -0.02em;
	margin: 0;
	}
	.bcql-header p {
	color: var(--body-text-color-subdued);
	margin-top: 0.4rem;
	}
	.bcql-status-ok {
	background: linear-gradient(90deg, #e8f5e9 0%, #f1f8e9 100%);
	border-left: 4px solid #2e7d32;
	border-radius: 8px;
	padding: 0.85rem 1rem;
	color: #1b5e20;
	font-weight: 600;
	}
	.bcql-status-err {
	background: linear-gradient(90deg, #ffebee 0%, #fff3e0 100%);
	border-left: 4px solid #c62828;
	border-radius: 8px;
	padding: 0.85rem 1rem;
	color: #b71c1c;
	font-weight: 600;
	}
	.dark .bcql-status-ok {
	background: rgba(46, 125, 50, 0.15);
	color: #a5d6a7;
	}
	.dark .bcql-status-err {
	background: rgba(198, 40, 40, 0.18);
	color: #ef9a9a;
	}
	.bcql-footer {
	text-align: center;
	color: var(--body-text-color-subdued);
	padding: 1rem 0 0.5rem 0;
	font-size: 0.9rem;
	}
	"""

	# AST placeholder returned / displayed whenever no AST was produced.
	EMPTY_AST: dict[str, Any] = {}
	# Neutral status banner shown before any validation has run; it reuses the
	# "ok" banner class at reduced opacity.
	EMPTY_STATUS: str = (
	'<div class="bcql-status-ok" style="opacity: 0.6;">'
	"Enter a BCQL query above and click <b>Validate query</b>.</div>"
	)


	def _parse_csv(value: str) -> list[str]:
	"""Split a comma/whitespace-separated string into a clean list of names."""
	if not value:
	return []
	parts: list[str] = []
	for chunk in value.replace("\n", ",").split(","):
	chunk = chunk.strip()
	if chunk:
	parts.append(chunk)
	return parts


	def _parse_closed_attributes(text: str) -> dict[str, list[str]]:
	    """Parse a textarea of ``key: val1, val2`` lines into a dict.

	    Lines starting with ``#`` and blank lines are ignored. Only the first
	    ``:`` on a line separates the name from its values. When the same name
	    appears on several lines, the values are accumulated instead of the
	    later line silently replacing the earlier one.

	    Args:
	        text: Raw textarea content; ``None`` or empty yields an empty dict.

	    Returns:
	        Mapping of annotation name to its listed values, in input order.

	    Raises:
	        ValueError: If a line has no ``:`` or an empty name, so the UI can
	            show a friendly error.
	    """
	    result: dict[str, list[str]] = {}
	    for raw in (text or "").splitlines():
	        line = raw.strip()
	        if not line or line.startswith("#"):
	            continue
	        key, separator, values = line.partition(":")
	        if not separator:
	            raise ValueError(
	                f"Closed attributes line must look like 'key: val1, val2': {raw!r}"
	            )
	        key = key.strip()
	        if not key:
	            raise ValueError(f"Empty annotation name in line: {raw!r}")
	        # Extend rather than assign: a repeated key must not drop earlier values.
	        result.setdefault(key, []).extend(_parse_csv(values))
	    return result


	def _build_custom_spec(
	    open_attrs: str,
	    closed_attrs: str,
	    strict: bool,
	    allow_alignment: bool,
	    allow_relations: bool,
	    span_tags: str,
	    relations: str,
	) -> CorpusSpec:
	    """Assemble a [CorpusSpec][bcql_py.validation.CorpusSpec] from the raw custom-spec form values.

	    The text fields are split with ``_parse_csv`` / ``_parse_closed_attributes``
	    and frozen into sets. An empty "span tags" or "relations" field is passed
	    on as ``None`` rather than as an empty set.

	    Raises:
	        ValueError: Propagated from ``_parse_closed_attributes`` when a
	            closed-attribute line is malformed.
	    """
	    closed_values = {
	        name: frozenset(values)
	        for name, values in _parse_closed_attributes(closed_attrs).items()
	    }
	    span_tag_set = frozenset(_parse_csv(span_tags))
	    relation_set = frozenset(_parse_csv(relations))
	    return CorpusSpec(
	        open_attributes=frozenset(_parse_csv(open_attrs)),
	        closed_attributes=closed_values,
	        strict_attributes=strict,
	        allow_alignment=allow_alignment,
	        allow_relations=allow_relations,
	        # An empty frozenset is falsy, so ``or None`` maps "nothing listed" to None.
	        allowed_span_tags=span_tag_set or None,
	        allowed_relations=relation_set or None,
	    )


	def _format_syntax_error(error: BCQLSyntaxError) -> str:
	"""Format a [BCQLSyntaxError][bcql_py.exceptions.BCQLSyntaxError] as a fenced markdown snippet with caret."""
	lines = ["Syntax error", "", f"> {error.message}"]
	if error.query and error.position is not None:
	lines.extend(
	[
	"",
	"```text",
	error.query,
	" " * error.position + "^",
	"```",
	]
	)
	return "\n".join(lines)


	def _format_validation_error(error: BCQLValidationError) -> str:
	"""Format a [BCQLValidationError][bcql_py.exceptions.BCQLValidationError] as a markdown bullet list."""
	if len(error.issues) == 1:
	issue = error.issues[0]
	parts = [
	"Validation error",
	"",
	f"- {issue.kind}: {issue.message}",
	]
	if issue.context:
	parts.append("")
	parts.append("Context:")
	for key, val in issue.context.items():
	parts.append(f" - {key}: {val!r}")
	return "\n".join(parts)

	parts = [f"Found {len(error.issues)} validation issue(s):", ""]
	for issue in error.issues:
	parts.append(f"- {issue.kind}: {issue.message}")
	if issue.context:
	ctx = ", ".join(f"`{k}={v!r}`" for k, v in issue.context.items())
	parts.append(f" - {ctx}")
	return "\n".join(parts)


	def _ok_html(message: str) -> str:
	return f'<div class="bcql-status-ok">✅ {message}</div>'


	def _err_html(message: str) -> str:
	return f'<div class="bcql-status-err">❌ {message}</div>'


	def validate_query(
	    query: str,
	    preset_name: str,
	    use_custom: bool,
	    custom_open: str,
	    custom_closed: str,
	    custom_strict: bool,
	    custom_allow_alignment: bool,
	    custom_allow_relations: bool,
	    custom_span_tags: str,
	    custom_relations: str,
	    fail_fast: bool,
	) -> tuple[str, str, dict[str, Any]]:
	    """Run the parser/validator and return UI-ready values.

	    Args:
	        query: Raw BCQL text; surrounding whitespace is stripped first.
	        preset_name: Key into ``PRESETS``. An unknown name yields no spec.
	        use_custom: When true, the spec is built from the ``custom_*``
	            values and ``preset_name`` is ignored.
	        custom_open: Custom-spec "open attributes" field.
	        custom_closed: Custom-spec "closed attributes" field.
	        custom_strict: Custom-spec "strict attributes" flag.
	        custom_allow_alignment: Custom-spec "allow alignment" flag.
	        custom_allow_relations: Custom-spec "allow relations" flag.
	        custom_span_tags: Custom-spec "allowed span tags" field.
	        custom_relations: Custom-spec "allowed relation labels" field.
	        fail_fast: Forwarded unchanged to ``parse``.

	    Returns:
	        Tuple of ``(status_html, error_markdown, ast_dict)``.
	        ``error_markdown`` is empty on success. ``ast_dict`` is a plain
	        dict; empty when no AST was produced.
	    """
	    query = (query or "").strip()
	    if not query:
	        return (EMPTY_STATUS, "", EMPTY_AST)

	    spec: CorpusSpec | None
	    if use_custom:
	        try:
	            spec = _build_custom_spec(
	                custom_open,
	                custom_closed,
	                custom_strict,
	                custom_allow_alignment,
	                custom_allow_relations,
	                custom_span_tags,
	                custom_relations,
	            )
	        except ValueError as exc:
	            return (
	                _err_html("Invalid custom corpus spec."),
	                f"Custom spec error\n\n> {exc}",
	                EMPTY_AST,
	            )
	    else:
	        spec = PRESETS.get(preset_name)

	    try:
	        ast = parse(query, spec=spec, fail_fast=fail_fast)
	    except BCQLSyntaxError as exc:
	        return (
	            _err_html("Query failed to parse."),
	            _format_syntax_error(exc),
	            EMPTY_AST,
	        )
	    except BCQLValidationError as exc:
	        # Try to also surface the parsed AST when validation (not parsing) fails:
	        try:
	            ast_only = parse(query)
	            ast_dict = ast_only.model_dump(mode="json")
	        except Exception:
	            # Deliberate best-effort: the AST is a bonus here, so any failure
	            # while re-parsing must not hide the validation report.
	            ast_dict = EMPTY_AST
	        return (
	            _err_html(
	                "Query syntactically parses but does not match the corpus spec."
	            ),
	            _format_validation_error(exc),
	            ast_dict,
	        )

	    label = (
	        "Query is syntactically valid."
	        if spec is None
	        else (
	            "Query is valid against the selected corpus spec "
	            f"({'custom' if use_custom else preset_name})."
	        )
	    )
	    return (_ok_html(label), "", ast.model_dump(mode="json"))


	def validate_example(
	    query: str, preset_name: str
	) -> tuple[str, str, dict[str, Any]]:
	    """Two-argument adapter that lets ``gr.Examples`` call ``validate_query``.

	    The custom spec is switched off and its fields are left empty, so only the
	    query and the chosen preset influence the result; ``fail_fast`` is off.
	    """
	    return validate_query(
	        query=query,
	        preset_name=preset_name,
	        use_custom=False,
	        custom_open="",
	        custom_closed="",
	        custom_strict=False,
	        custom_allow_alignment=True,
	        custom_allow_relations=True,
	        custom_span_tags="",
	        custom_relations="",
	        fail_fast=False,
	    )


	def render_spec_description(preset_name: str) -> str:
	    """Return markdown describing the preset's [CorpusSpec][bcql_py.validation.CorpusSpec].

	    A preset that maps to a spec yields that spec's ``description``. When the
	    name is unknown or maps to ``None``, a fixed hint explaining that only
	    syntactic parsing takes place is returned instead.
	    """
	    selected = PRESETS.get(preset_name)
	    if selected is not None:
	        return selected.description
	    return (
	        "_No corpus spec selected: only syntactic parsing is performed._\n\n"
	        "Pick a preset or enable Custom corpus spec to also run "
	        "semantic validation against a corpus vocabulary."
	    )


	# Page header markup (title plus one-line tagline with links); rendered via
	# ``gr.HTML`` and styled by the ``.bcql-header`` rules in ``CUSTOM_CSS``.
	INTRO_HTML = """
	<div class="bcql-header">
	<h1>BCQL Validator</h1>
	<p>Parse and validate <strong><a href="https://blacklab.ivdnt.org/" target="_blank">BlackLab</a></strong>
	Corpus Query Language queries with
	<strong><a href="https://bramvanroy.github.io/bcql_py/" target="_blank"><code>bcql_py</code></a></strong></p>
	</div>
	"""

	# "About" text rendered as markdown in the sidebar.
	ABOUT_MARKDOWN = """
	## About

	`bcql_py` is a Python parser for the BlackLab Corpus Query Language. It produces
	a frozen Pydantic AST that round-trips back to BCQL or JSON, and ships an
	optional semantic validation layer driven by a `CorpusSpec`.

	This demo lets you:

	- Parse any BCQL query and see the resulting AST as JSON.
	- Validate the query against a built-in preset (`UD`, `Lassy`) or a
	custom corpus spec you define on the fly.

	When parsing or validation fails, the error message points at the offending
	position in the query: useful both for humans and for LLM-driven feedback
	loops.

	Links: [GitHub](https://github.com/BramVanroy/bcql_py) ·
	[Documentation](https://bramvanroy.github.io/bcql_py/) ·
	[BCQL cheatsheet](https://bramvanroy.github.io/bcql_py/guides/cheatsheet/)
	"""

	# Build the UI. Components are laid out by the ``with`` container they are
	# created in, in creation order.
	# NOTE(review): the container nesting below was reconstructed from statement
	# order and the section comments - confirm against the rendered layout.
	with gr.Blocks(
	    title="BCQL Validator",
	    analytics_enabled=False,
	) as demo:
	    # Heading
	    gr.HTML(INTRO_HTML)

	    # Sidebar with about text
	    with gr.Sidebar(position="right"):
	        gr.Markdown(ABOUT_MARKDOWN)

	    # Main content: query input and results
	    with gr.Row(equal_height=False):
	        # Main column for input (query, preset / custom spec, validate button)
	        with gr.Column(scale=5):
	            query_input = gr.Textbox(
	                label="BCQL query",
	                value='[lemma="search" & pos="NOUN"]',
	                lines=4,
	                max_lines=12,
	                placeholder='[pos="NOUN"] "the" [pos="ADJ"]+ "man" ...',
	            )

	            with gr.Row():
	                preset_dropdown = gr.Dropdown(
	                    choices=list(PRESETS.keys()),
	                    value="None (permissive)",
	                    label="Corpus preset",
	                    scale=3,
	                )
	                fail_fast = gr.Checkbox(
	                    value=False,
	                    label="Fail fast",
	                    info="Stop at the first issue instead of collecting all.",
	                    scale=2,
	                )

	            with gr.Accordion("Custom corpus spec (advanced)", open=False):
	                use_custom = gr.Checkbox(
	                    value=False,
	                    label="Use custom spec instead of preset",
	                )
	                custom_open = gr.Textbox(
	                    label="Open attributes",
	                    placeholder="word, lemma, xpos",
	                    info="Comma-separated annotation names with unconstrained values.",
	                )
	                custom_closed = gr.Textbox(
	                    label="Closed attributes",
	                    placeholder="pos: NOUN, VERB, ADJ\nNumber: Sing, Plur",
	                    info="One per line: 'name: value1, value2, ...'.",
	                    lines=4,
	                )
	                with gr.Row():
	                    custom_strict = gr.Checkbox(
	                        value=False,
	                        label="Strict attributes",
	                        info="Reject any annotation not listed above.",
	                    )
	                    custom_allow_alignment = gr.Checkbox(
	                        value=True, label="Allow alignment (==>)"
	                    )
	                    custom_allow_relations = gr.Checkbox(
	                        value=True, label="Allow relations (-->)"
	                    )
	                custom_span_tags = gr.Textbox(
	                    label="Allowed span tags",
	                    placeholder="s, p, ne",
	                    info="Leave empty to allow any tag.",
	                )
	                custom_relations = gr.Textbox(
	                    label="Allowed relation labels",
	                    placeholder="nsubj, obj, amod",
	                    info="Leave empty to allow any relation label.",
	                )

	            validate_btn = gr.Button(
	                "Validate query", variant="primary", size="lg"
	            )

	        # Column for outputs (status, error details, AST / active-spec tabs)
	        with gr.Column(scale=4):
	            status_box = gr.HTML(value=EMPTY_STATUS, label="Status")
	            error_md = gr.Markdown(
	                value="",
	                label="Error details",
	                latex_delimiters=None,
	            )

	            with gr.Tabs():
	                with gr.Tab(TAB_LABELS[0], render_children=True):
	                    ast_output = gr.JSON(value=EMPTY_AST, label="")
	                with gr.Tab(TAB_LABELS[1], render_children=True):
	                    spec_md = gr.Markdown(
	                        value=render_spec_description("None (permissive)"),
	                    )

	    # Order matters: these are passed positionally to ``validate_query``.
	    inputs = [
	        query_input,
	        preset_dropdown,
	        use_custom,
	        custom_open,
	        custom_closed,
	        custom_strict,
	        custom_allow_alignment,
	        custom_allow_relations,
	        custom_span_tags,
	        custom_relations,
	        fail_fast,
	    ]
	    # Order matches the 3-tuple returned by ``validate_query``.
	    outputs = [status_box, error_md, ast_output]

	    with gr.Row():
	        # Examples only set the query and the preset, hence ``inputs[:2]`` and
	        # the two-argument ``validate_example`` wrapper.
	        gr.Examples(
	            examples=EXAMPLES,
	            inputs=inputs[:2],
	            outputs=outputs,
	            label="Examples",
	            examples_per_page=10,
	            cache_examples=True,
	            fn=validate_example,
	            preload=5,
	        )

	    gr.HTML(
	        '<div class="bcql-footer">Built with '
	        '<a href="https://bramvanroy.github.io/bcql_py/">bcql_py</a> · '
	        '<a href="https://gradio.app/">Gradio</a></div>'
	    )

	    validate_btn.click(validate_query, inputs=inputs, outputs=outputs)
	    # On preset change, refresh the Active spec tab and re-validate.
	    preset_dropdown.change(
	        render_spec_description,
	        inputs=preset_dropdown,
	        outputs=spec_md,
	    )
	    preset_dropdown.change(validate_query, inputs=inputs, outputs=outputs)
	    # Toggling the custom-spec switch re-validates as well.
	    use_custom.change(validate_query, inputs=inputs, outputs=outputs)

	# Theme and CSS are supplied at launch time.
	demo.launch(theme=THEME, css=CUSTOM_CSS)