BramVanroy's picture
Sync from GitHub via hub-sync
b6496cb verified
"""Gradio demo for ``bcql_py`` query validation.
A small, illustrative web UI that lets users paste a BlackLab Corpus Query Language
(BCQL) query, optionally pick or customize a [CorpusSpec][bcql_py.validation.CorpusSpec],
and inspect parsing / validation results in real time.
Run locally with::
uv sync --group app
uv run python app/app.py
The same script powers the hosted demo on Hugging Face Spaces.
"""
from __future__ import annotations
from typing import Any
import gradio as gr
from bcql_py import (
BCQLSyntaxError,
BCQLValidationError,
CorpusSpec,
parse,
)
from bcql_py.validation.presets import LASSY, UD
# Maps the label shown in the "Corpus preset" dropdown to the corpus spec that
# is handed to the parser. ``None`` means no spec is used. The key order is the
# order of the dropdown choices (``list(PRESETS.keys())``).
PRESETS: dict[str, CorpusSpec | None] = {
    "None (permissive)": None,
    "Universal Dependencies (UD)": UD,
    "Lassy / Alpino": LASSY,
}
# Rows for ``gr.Examples``: each row is ``[query, preset label]`` and each
# preset label matches a key of ``PRESETS``.
# NOTE(review): the last three rows look like deliberate failure cases (syntax
# error, unknown value, unknown attribute) — presumably there to showcase the
# error output; confirm against the parser/presets.
EXAMPLES: list[list[str]] = [
    ['"man"', "None (permissive)"],
    ['[lemma="search" & pos="NOUN"]', "Universal Dependencies (UD)"],
    ['"the" [pos="ADJ"]+ "man"', "Universal Dependencies (UD)"],
    ['<s/> containing "fluffy"', "None (permissive)"],
    ['"baker" within <ne type="PERS"/>', "None (permissive)"],
    ['[pos="VERB"] -nsubj-> [pos="NOUN"]', "Universal Dependencies (UD)"],
    ['A:[pos="ADJ"] "man"', "Universal Dependencies (UD)"],
    ['[pos="NOUN" &]', "None (permissive)"],
    ['[pos="BANANA"]', "Universal Dependencies (UD)"],
    ['[unknownattr="x"]', "Universal Dependencies (UD)"],
]
# Titles of the result tabs; index 0 labels the AST tab and index 1 the
# active-spec tab in the UI layout.
TAB_LABELS: tuple[str, ...] = (
    "AST (JSON)",
    "Active spec",
)
# Indigo (mkdocs-material primary) shades:
# Custom Gradio colour palette (c50 lightest .. c950 darkest) used as both the
# primary and secondary hue of ``THEME``.
INDIGO = gr.themes.Color(
    name="indigo",
    c50="#e8eaf6",
    c100="#c5cae9",
    c200="#9fa8da",
    c300="#7986cb",
    c400="#5c6bc0",
    c500="#3f51b5",
    c600="#3949ab",
    c700="#303f9f",
    c800="#283593",
    c900="#1a237e",
    c950="#0d1442",
)
# App theme: Gradio's ``Soft`` theme with the indigo palette, Roboto fonts
# (with generic fallbacks), and a few variables overridden through ``.set()``.
# Values starting with ``*`` reference other theme variables.
THEME = gr.themes.Soft(
    primary_hue=INDIGO,
    secondary_hue=INDIGO,
    neutral_hue="slate",
    font=(
        gr.themes.GoogleFont("Roboto"),
        "ui-sans-serif",
        "system-ui",
        "sans-serif",
    ),
    font_mono=(
        gr.themes.GoogleFont("Roboto Mono"),
        "ui-monospace",
        "Consolas",
        "monospace",
    ),
).set(
    body_background_fill="*neutral_50",
    body_background_fill_dark="*neutral_950",
    block_radius="*radius_lg",
    button_primary_background_fill="*primary_500",
    button_primary_background_fill_hover="*primary_600",
    button_primary_text_color="white",
)
# Theming: https://www.gradio.app/guides/theming-guide#extending-themes-via-set
# Extra stylesheet passed to ``demo.launch(css=...)``. It styles the page header
# (``.bcql-header``), the success/error status banners (``.bcql-status-ok`` /
# ``.bcql-status-err``, each with a ``.dark`` override) and the footer.
CUSTOM_CSS = """
.bcql-header {
text-align: center;
padding: 1.25rem 0 0.5rem 0;
}
.bcql-header h1 {
font-weight: 700;
letter-spacing: -0.02em;
margin: 0;
}
.bcql-header p {
color: var(--body-text-color-subdued);
margin-top: 0.4rem;
}
.bcql-status-ok {
background: linear-gradient(90deg, #e8f5e9 0%, #f1f8e9 100%);
border-left: 4px solid #2e7d32;
border-radius: 8px;
padding: 0.85rem 1rem;
color: #1b5e20;
font-weight: 600;
}
.bcql-status-err {
background: linear-gradient(90deg, #ffebee 0%, #fff3e0 100%);
border-left: 4px solid #c62828;
border-radius: 8px;
padding: 0.85rem 1rem;
color: #b71c1c;
font-weight: 600;
}
.dark .bcql-status-ok {
background: rgba(46, 125, 50, 0.15);
color: #a5d6a7;
}
.dark .bcql-status-err {
background: rgba(198, 40, 40, 0.18);
color: #ef9a9a;
}
.bcql-footer {
text-align: center;
color: var(--body-text-color-subdued);
padding: 1rem 0 0.5rem 0;
font-size: 0.9rem;
}
"""
# Placeholder AST used whenever no AST could be produced. This single dict
# instance is returned from several code paths, so it must not be mutated.
EMPTY_AST: dict[str, Any] = {}
# Initial status banner, also returned when the query box is empty. It reuses
# the "ok" banner class at reduced opacity.
EMPTY_STATUS: str = (
    '<div class="bcql-status-ok" style="opacity: 0.6;">'
    "Enter a BCQL query above and click <b>Validate query</b>.</div>"
)
def _parse_csv(value: str) -> list[str]:
"""Split a comma/whitespace-separated string into a clean list of names."""
if not value:
return []
parts: list[str] = []
for chunk in value.replace("\n", ",").split(","):
chunk = chunk.strip()
if chunk:
parts.append(chunk)
return parts
def _parse_closed_attributes(text: str) -> dict[str, list[str]]:
"""Parse a textarea of ``key: val1, val2`` lines into a dict.
Lines starting with ``#`` and blank lines are ignored. Raises ``ValueError``
on malformed lines so the UI can show a friendly error.
"""
result: dict[str, list[str]] = {}
for raw in (text or "").splitlines():
line = raw.strip()
if not line or line.startswith("#"):
continue
if ":" not in line:
raise ValueError(
f"Closed attributes line must look like 'key: val1, val2': {raw!r}"
)
key, _, values = line.partition(":")
key = key.strip()
if not key:
raise ValueError(f"Empty annotation name in line: {raw!r}")
result[key] = _parse_csv(values)
return result
def _build_custom_spec(
    open_attrs: str,
    closed_attrs: str,
    strict: bool,
    allow_alignment: bool,
    allow_relations: bool,
    span_tags: str,
    relations: str,
) -> CorpusSpec:
    """Build a [CorpusSpec][bcql_py.validation.CorpusSpec] from the custom-spec form.

    Args:
        open_attrs: Comma/newline-separated annotation names.
        closed_attrs: ``name: value1, value2`` lines, one annotation per line.
        strict: Forwarded as ``strict_attributes``.
        allow_alignment: Forwarded as ``allow_alignment``.
        allow_relations: Forwarded as ``allow_relations``.
        span_tags: Comma/newline-separated span tags; empty means ``None``.
        relations: Comma/newline-separated relation labels; empty means ``None``.

    Returns:
        The spec assembled from the parsed form fields.

    Raises:
        ValueError: Propagated from ``_parse_closed_attributes`` on a malformed line.
    """
    closed_values = {
        name: frozenset(values)
        for name, values in _parse_closed_attributes(closed_attrs).items()
    }
    # An empty field means "no restriction", which the spec expresses as None.
    tag_names = frozenset(_parse_csv(span_tags)) or None
    relation_names = frozenset(_parse_csv(relations)) or None
    open_names = frozenset(_parse_csv(open_attrs))
    return CorpusSpec(
        open_attributes=open_names,
        closed_attributes=closed_values,
        strict_attributes=strict,
        allow_alignment=allow_alignment,
        allow_relations=allow_relations,
        allowed_span_tags=tag_names,
        allowed_relations=relation_names,
    )
def _format_syntax_error(error: BCQLSyntaxError) -> str:
"""Format a [BCQLSyntaxError][bcql_py.exceptions.BCQLSyntaxError] as a fenced markdown snippet with caret."""
lines = ["**Syntax error**", "", f"> {error.message}"]
if error.query and error.position is not None:
lines.extend(
[
"",
"```text",
error.query,
" " * error.position + "^",
"```",
]
)
return "\n".join(lines)
def _format_validation_error(error: BCQLValidationError) -> str:
"""Format a [BCQLValidationError][bcql_py.exceptions.BCQLValidationError] as a markdown bullet list."""
if len(error.issues) == 1:
issue = error.issues[0]
parts = [
"**Validation error**",
"",
f"- **{issue.kind}**: {issue.message}",
]
if issue.context:
parts.append("")
parts.append("**Context:**")
for key, val in issue.context.items():
parts.append(f" - {key}: {val!r}")
return "\n".join(parts)
parts = [f"**Found {len(error.issues)} validation issue(s):**", ""]
for issue in error.issues:
parts.append(f"- **{issue.kind}**: {issue.message}")
if issue.context:
ctx = ", ".join(f"`{k}={v!r}`" for k, v in issue.context.items())
parts.append(f" - {ctx}")
return "\n".join(parts)
def _ok_html(message: str) -> str:
return f'<div class="bcql-status-ok">✅ {message}</div>'
def _err_html(message: str) -> str:
return f'<div class="bcql-status-err">❌ {message}</div>'
def validate_query(
    query: str,
    preset_name: str,
    use_custom: bool,
    custom_open: str,
    custom_closed: str,
    custom_strict: bool,
    custom_allow_alignment: bool,
    custom_allow_relations: bool,
    custom_span_tags: str,
    custom_relations: str,
    fail_fast: bool,
) -> tuple[str, str, dict[str, Any]]:
    """Parse and validate ``query`` and return values ready for the UI.

    The corpus spec is built from the custom form fields when ``use_custom``
    is set, and looked up in ``PRESETS`` otherwise (an unknown preset name
    yields no spec).

    Args:
        query: The BCQL query text; surrounding whitespace is ignored.
        preset_name: Label of the selected preset.
        use_custom: Use the custom spec fields instead of the preset.
        custom_open: Custom spec: open attribute names.
        custom_closed: Custom spec: closed attribute lines.
        custom_strict: Custom spec: strict attributes flag.
        custom_allow_alignment: Custom spec: allow alignment flag.
        custom_allow_relations: Custom spec: allow relations flag.
        custom_span_tags: Custom spec: allowed span tags.
        custom_relations: Custom spec: allowed relation labels.
        fail_fast: Forwarded to ``parse``.

    Returns:
        Tuple of ``(status_html, error_markdown, ast_dict)``. ``ast_dict`` is a
        plain dict; it is empty when no AST was produced.
    """
    cleaned = (query or "").strip()
    if cleaned == "":
        return (EMPTY_STATUS, "", EMPTY_AST)

    active_spec: CorpusSpec | None
    if not use_custom:
        active_spec = PRESETS.get(preset_name)
    else:
        try:
            active_spec = _build_custom_spec(
                custom_open,
                custom_closed,
                custom_strict,
                custom_allow_alignment,
                custom_allow_relations,
                custom_span_tags,
                custom_relations,
            )
        except ValueError as spec_error:
            return (
                _err_html("Invalid custom corpus spec."),
                f"**Custom spec error**\n\n> {spec_error}",
                EMPTY_AST,
            )

    try:
        tree = parse(cleaned, spec=active_spec, fail_fast=fail_fast)
    except BCQLSyntaxError as syntax_error:
        return (
            _err_html("Query failed to parse."),
            _format_syntax_error(syntax_error),
            EMPTY_AST,
        )
    except BCQLValidationError as validation_error:
        # Validation failed, not parsing: re-parse without a spec so the AST
        # can still be shown. This is best-effort; any failure falls back to
        # the empty AST.
        try:
            fallback_ast = parse(cleaned).model_dump(mode="json")
        except Exception:
            fallback_ast = EMPTY_AST
        return (
            _err_html(
                "Query syntactically parses but does not match the corpus spec."
            ),
            _format_validation_error(validation_error),
            fallback_ast,
        )

    if active_spec is None:
        summary = "Query is syntactically valid."
    else:
        spec_name = "custom" if use_custom else preset_name
        summary = (
            "Query is valid against the selected corpus spec "
            f"({spec_name})."
        )
    return (_ok_html(summary), "", tree.model_dump(mode="json"))
def validate_example(
    query: str, preset_name: str
) -> tuple[str, str, dict[str, Any]]:
    """Two-argument adapter around ``validate_query`` for ``gr.Examples``.

    Only the query and preset come from the example row. The custom spec is
    disabled, its fields are left empty, and ``fail_fast`` is off.
    """
    return validate_query(
        query=query,
        preset_name=preset_name,
        use_custom=False,
        custom_open="",
        custom_closed="",
        custom_strict=False,
        custom_allow_alignment=True,
        custom_allow_relations=True,
        custom_span_tags="",
        custom_relations="",
        fail_fast=False,
    )
def render_spec_description(preset_name: str) -> str:
    """Return markdown describing the preset's [CorpusSpec][bcql_py.validation.CorpusSpec].

    Args:
        preset_name: Label of the selected preset.

    Returns:
        The spec's ``description``, or an explanatory note when the label maps
        to no spec (including labels that are not in ``PRESETS``).
    """
    selected = PRESETS.get(preset_name)
    if selected is not None:
        return selected.description
    return (
        "_No corpus spec selected: only syntactic parsing is performed._\n\n"
        "Pick a preset or enable **Custom corpus spec** to also run "
        "semantic validation against a corpus vocabulary."
    )
# Page header markup (title and tagline with links to BlackLab and the bcql_py
# docs); rendered through ``gr.HTML`` and styled by the ``.bcql-header`` rules.
INTRO_HTML = """
<div class="bcql-header">
<h1>BCQL Validator</h1>
<p>Parse and validate <strong><a href="https://blacklab.ivdnt.org/" target="_blank">BlackLab</a></strong>
Corpus Query Language queries with
<strong><a href="https://bramvanroy.github.io/bcql_py/" target="_blank"><code>bcql_py</code></a></strong></p>
</div>
"""
# "About" text shown in the right-hand sidebar via ``gr.Markdown``.
ABOUT_MARKDOWN = """
## About
`bcql_py` is a Python parser for the BlackLab Corpus Query Language. It produces
a frozen Pydantic AST that round-trips back to BCQL or JSON, and ships an
optional semantic validation layer driven by a `CorpusSpec`.
This demo lets you:
- **Parse** any BCQL query and see the resulting AST as JSON.
- **Validate** the query against a built-in preset (`UD`, `Lassy`) or a
custom corpus spec you define on the fly.
When parsing or validation fails, the error message points at the offending
position in the query: useful both for humans and for LLM-driven feedback
loops.
**Links:** [GitHub](https://github.com/BramVanroy/bcql_py) ·
[Documentation](https://bramvanroy.github.io/bcql_py/) ·
[BCQL cheatsheet](https://bramvanroy.github.io/bcql_py/guides/cheatsheet/)
"""
# UI definition. Components are created in layout order inside nested context
# managers; event handlers are wired at the end once every component exists.
with gr.Blocks(
    title="BCQL Validator",
    analytics_enabled=False,
) as demo:
    # Heading
    gr.HTML(INTRO_HTML)
    # Sidebar with about text
    with gr.Sidebar(position="right"):
        gr.Markdown(ABOUT_MARKDOWN)
    # Main content: query input and results
    with gr.Row(equal_height=False):
        # Main column for input (query, preset, custom spec, validate button)
        with gr.Column(scale=5):
            query_input = gr.Textbox(
                label="BCQL query",
                value='[lemma="search" & pos="NOUN"]',
                lines=4,
                max_lines=12,
                placeholder='[pos="NOUN"] "the" [pos="ADJ"]+ "man" ...',
            )
            with gr.Row():
                preset_dropdown = gr.Dropdown(
                    choices=list(PRESETS.keys()),
                    value="None (permissive)",
                    label="Corpus preset",
                    scale=3,
                )
                fail_fast = gr.Checkbox(
                    value=False,
                    label="Fail fast",
                    info="Stop at the first issue instead of collecting all.",
                    scale=2,
                )
            # Form fields that feed the custom spec; they only take effect
            # when the "use custom" checkbox is ticked.
            with gr.Accordion("Custom corpus spec (advanced)", open=False):
                use_custom = gr.Checkbox(
                    value=False,
                    label="Use custom spec instead of preset",
                )
                custom_open = gr.Textbox(
                    label="Open attributes",
                    placeholder="word, lemma, xpos",
                    info="Comma-separated annotation names with unconstrained values.",
                )
                custom_closed = gr.Textbox(
                    label="Closed attributes",
                    placeholder="pos: NOUN, VERB, ADJ\nNumber: Sing, Plur",
                    info="One per line: 'name: value1, value2, ...'.",
                    lines=4,
                )
                with gr.Row():
                    custom_strict = gr.Checkbox(
                        value=False,
                        label="Strict attributes",
                        info="Reject any annotation not listed above.",
                    )
                    custom_allow_alignment = gr.Checkbox(
                        value=True, label="Allow alignment (==>)"
                    )
                    custom_allow_relations = gr.Checkbox(
                        value=True, label="Allow relations (-->)"
                    )
                custom_span_tags = gr.Textbox(
                    label="Allowed span tags",
                    placeholder="s, p, ne",
                    info="Leave empty to allow any tag.",
                )
                custom_relations = gr.Textbox(
                    label="Allowed relation labels",
                    placeholder="nsubj, obj, amod",
                    info="Leave empty to allow any relation label.",
                )
            validate_btn = gr.Button(
                "Validate query", variant="primary", size="lg"
            )
        # Column for outputs (status, error details, AST / active-spec tabs)
        with gr.Column(scale=4):
            status_box = gr.HTML(value=EMPTY_STATUS, label="Status")
            error_md = gr.Markdown(
                value="",
                label="Error details",
                latex_delimiters=None,
            )
            with gr.Tabs():
                with gr.Tab(TAB_LABELS[0], render_children=True):
                    ast_output = gr.JSON(value=EMPTY_AST, label="")
                with gr.Tab(TAB_LABELS[1], render_children=True):
                    spec_md = gr.Markdown(
                        value=render_spec_description("None (permissive)"),
                    )
    # Order must match the positional parameters of ``validate_query``.
    inputs = [
        query_input,
        preset_dropdown,
        use_custom,
        custom_open,
        custom_closed,
        custom_strict,
        custom_allow_alignment,
        custom_allow_relations,
        custom_span_tags,
        custom_relations,
        fail_fast,
    ]
    # Order must match the tuple returned by ``validate_query``.
    outputs = [status_box, error_md, ast_output]
    with gr.Row():
        # Examples only fill the first two inputs (query and preset);
        # ``validate_example`` supplies fixed values for everything else.
        gr.Examples(
            examples=EXAMPLES,
            inputs=inputs[:2],
            outputs=outputs,
            label="Examples",
            examples_per_page=10,
            cache_examples=True,
            fn=validate_example,
            preload=5,
        )
    gr.HTML(
        '<div class="bcql-footer">Built with '
        '<a href="https://bramvanroy.github.io/bcql_py/">bcql_py</a> · '
        '<a href="https://gradio.app/">Gradio</a></div>'
    )
    validate_btn.click(validate_query, inputs=inputs, outputs=outputs)
    # On preset change, refresh the Active spec tab and re-validate.
    # Two separate listeners are registered on the same ``change`` event.
    preset_dropdown.change(
        render_spec_description,
        inputs=preset_dropdown,
        outputs=spec_md,
    )
    preset_dropdown.change(validate_query, inputs=inputs, outputs=outputs)
    # Toggling the custom-spec checkbox also re-validates; edits to the other
    # custom fields have no listener and take effect on the next validation.
    use_custom.change(validate_query, inputs=inputs, outputs=outputs)
# Start the app at import/run time, applying the custom theme and stylesheet.
# NOTE(review): passing ``theme``/``css`` to ``launch()`` depends on the
# installed Gradio version accepting them there — confirm the pinned version.
demo.launch(theme=THEME, css=CUSTOM_CSS)