Spaces:

BramVanroy
/

bcql_py_validation

Sleeping

File size: 16,647 Bytes

"""Gradio demo for ``bcql_py`` query validation.

A small, illustrative web UI that lets users paste a BlackLab Corpus Query Language
(BCQL) query, optionally pick or customize a [CorpusSpec][bcql_py.validation.CorpusSpec],
and inspect parsing / validation results in real time.

Run locally with::

    uv sync --group app
    uv run python app/app.py

The same script powers the hosted demo on Hugging Face Spaces.
"""

from __future__ import annotations

from typing import Any

import gradio as gr

from bcql_py import (
    BCQLSyntaxError,
    BCQLValidationError,
    CorpusSpec,
    parse,
)
from bcql_py.validation.presets import LASSY, UD


# Maps each "Corpus preset" dropdown label to the CorpusSpec used for validation.
# ``None`` means no spec is handed to ``parse`` (labelled "permissive" in the UI).
PRESETS: dict[str, CorpusSpec | None] = {
    "None (permissive)": None,
    "Universal Dependencies (UD)": UD,
    "Lassy / Alpino": LASSY,
}

# Rows for ``gr.Examples``: each entry is ``[query, preset label]``, where the
# preset label must be a key of PRESETS.
# NOTE(review): the last three rows look like deliberately failing queries
# (dangling ``&``, unknown value, unknown attribute), presumably to showcase the
# error output -- confirm.
EXAMPLES: list[list[str]] = [
    ['"man"', "None (permissive)"],
    ['[lemma="search" & pos="NOUN"]', "Universal Dependencies (UD)"],
    ['"the" [pos="ADJ"]+ "man"', "Universal Dependencies (UD)"],
    ['<s/> containing "fluffy"', "None (permissive)"],
    ['"baker" within <ne type="PERS"/>', "None (permissive)"],
    ['[pos="VERB"] -nsubj-> [pos="NOUN"]', "Universal Dependencies (UD)"],
    ['A:[pos="ADJ"] "man"', "Universal Dependencies (UD)"],
    ['[pos="NOUN" &]', "None (permissive)"],
    ['[pos="BANANA"]', "Universal Dependencies (UD)"],
    ['[unknownattr="x"]', "Universal Dependencies (UD)"],
]

# Titles of the output tabs, in display order; the UI code indexes into this
# tuple when creating each ``gr.Tab``.
TAB_LABELS: tuple[str, ...] = (
    "AST (JSON)",
    "Active spec",
)


# Indigo (mkdocs-material primary) shades, from lightest (c50) to darkest (c950).
# Used as both the primary and the secondary hue of THEME.
INDIGO = gr.themes.Color(
    name="indigo",
    c50="#e8eaf6",
    c100="#c5cae9",
    c200="#9fa8da",
    c300="#7986cb",
    c400="#5c6bc0",
    c500="#3f51b5",
    c600="#3949ab",
    c700="#303f9f",
    c800="#283593",
    c900="#1a237e",
    c950="#0d1442",
)

# App theme: Gradio's ``Soft`` theme with the indigo palette, Roboto / Roboto Mono
# (each followed by generic fallbacks), and a few variables overridden via
# ``.set()``. Values such as ``*neutral_50`` reference theme variables.
THEME = gr.themes.Soft(
    primary_hue=INDIGO,
    secondary_hue=INDIGO,
    neutral_hue="slate",
    font=(
        gr.themes.GoogleFont("Roboto"),
        "ui-sans-serif",
        "system-ui",
        "sans-serif",
    ),
    font_mono=(
        gr.themes.GoogleFont("Roboto Mono"),
        "ui-monospace",
        "Consolas",
        "monospace",
    ),
).set(
    body_background_fill="*neutral_50",
    body_background_fill_dark="*neutral_950",
    block_radius="*radius_lg",
    button_primary_background_fill="*primary_500",
    button_primary_background_fill_hover="*primary_600",
    button_primary_text_color="white",
)

# Theming reference for the ``.set()`` overrides on THEME above:
# https://www.gradio.app/guides/theming-guide#extending-themes-via-set

# Extra stylesheet passed to ``demo.launch(css=...)``. It styles the page header,
# the ok/error status banners (plus ``.dark``-scoped variants) and the footer.
# The class names are referenced by the HTML snippets built elsewhere in this file.
CUSTOM_CSS = """
.bcql-header {
    text-align: center;
    padding: 1.25rem 0 0.5rem 0;
}
.bcql-header h1 {
    font-weight: 700;
    letter-spacing: -0.02em;
    margin: 0;
}
.bcql-header p {
    color: var(--body-text-color-subdued);
    margin-top: 0.4rem;
}
.bcql-status-ok {
    background: linear-gradient(90deg, #e8f5e9 0%, #f1f8e9 100%);
    border-left: 4px solid #2e7d32;
    border-radius: 8px;
    padding: 0.85rem 1rem;
    color: #1b5e20;
    font-weight: 600;
}
.bcql-status-err {
    background: linear-gradient(90deg, #ffebee 0%, #fff3e0 100%);
    border-left: 4px solid #c62828;
    border-radius: 8px;
    padding: 0.85rem 1rem;
    color: #b71c1c;
    font-weight: 600;
}
.dark .bcql-status-ok {
    background: rgba(46, 125, 50, 0.15);
    color: #a5d6a7;
}
.dark .bcql-status-err {
    background: rgba(198, 40, 40, 0.18);
    color: #ef9a9a;
}
.bcql-footer {
    text-align: center;
    color: var(--body-text-color-subdued);
    padding: 1rem 0 0.5rem 0;
    font-size: 0.9rem;
}
"""

# Placeholder shown in the AST tab when no AST is available.
# NOTE: this single dict object is returned as-is by ``validate_query``; treat it
# as read-only.
EMPTY_AST: dict[str, Any] = {}
# Dimmed banner prompting for input: the initial status and the result for a
# blank query.
EMPTY_STATUS: str = (
    '<div class="bcql-status-ok" style="opacity: 0.6;">'
    "Enter a BCQL query above and click <b>Validate query</b>.</div>"
)


def _parse_csv(value: str) -> list[str]:
    """Split a comma/whitespace-separated string into a clean list of names."""
    if not value:
        return []
    parts: list[str] = []
    for chunk in value.replace("\n", ",").split(","):
        chunk = chunk.strip()
        if chunk:
            parts.append(chunk)
    return parts


def _parse_closed_attributes(text: str) -> dict[str, list[str]]:
    """Parse a textarea of ``key: val1, val2`` lines into a dict.

    Lines starting with ``#`` and blank lines are ignored. Raises ``ValueError``
    on malformed lines so the UI can show a friendly error.
    """
    result: dict[str, list[str]] = {}
    for raw in (text or "").splitlines():
        line = raw.strip()
        if not line or line.startswith("#"):
            continue
        if ":" not in line:
            raise ValueError(
                f"Closed attributes line must look like 'key: val1, val2': {raw!r}"
            )
        key, _, values = line.partition(":")
        key = key.strip()
        if not key:
            raise ValueError(f"Empty annotation name in line: {raw!r}")
        result[key] = _parse_csv(values)
    return result


def _build_custom_spec(
    open_attrs: str,
    closed_attrs: str,
    strict: bool,
    allow_alignment: bool,
    allow_relations: bool,
    span_tags: str,
    relations: str,
) -> CorpusSpec:
    """Assemble a [CorpusSpec][bcql_py.validation.CorpusSpec] from the raw custom-spec form values.

    Args:
        open_attrs: Comma/newline-separated annotation names.
        closed_attrs: ``name: value1, value2`` lines (see ``_parse_closed_attributes``).
        strict: Passed through as ``strict_attributes``.
        allow_alignment: Passed through as ``allow_alignment``.
        allow_relations: Passed through as ``allow_relations``.
        span_tags: Comma/newline-separated span tag names; empty gives ``None``.
        relations: Comma/newline-separated relation labels; empty gives ``None``.

    Returns:
        The spec built from the parsed fields.

    Raises:
        ValueError: Propagated from ``_parse_closed_attributes`` on malformed lines.
    """
    # Parse the closed attributes first: this is the only step that can reject input.
    closed_values = {
        name: frozenset(values)
        for name, values in _parse_closed_attributes(closed_attrs).items()
    }
    # An empty field becomes None rather than an empty set (the form describes
    # an empty field as "allow any").
    span_tag_set = frozenset(_parse_csv(span_tags)) or None
    relation_set = frozenset(_parse_csv(relations)) or None
    open_names = frozenset(_parse_csv(open_attrs))
    return CorpusSpec(
        open_attributes=open_names,
        closed_attributes=closed_values,
        strict_attributes=strict,
        allow_alignment=allow_alignment,
        allow_relations=allow_relations,
        allowed_span_tags=span_tag_set,
        allowed_relations=relation_set,
    )


def _format_syntax_error(error: BCQLSyntaxError) -> str:
    """Format a [BCQLSyntaxError][bcql_py.exceptions.BCQLSyntaxError] as a fenced markdown snippet with caret."""
    lines = ["**Syntax error**", "", f"> {error.message}"]
    if error.query and error.position is not None:
        lines.extend(
            [
                "",
                "```text",
                error.query,
                " " * error.position + "^",
                "```",
            ]
        )
    return "\n".join(lines)


def _format_validation_error(error: BCQLValidationError) -> str:
    """Format a [BCQLValidationError][bcql_py.exceptions.BCQLValidationError] as a markdown bullet list."""
    if len(error.issues) == 1:
        issue = error.issues[0]
        parts = [
            "**Validation error**",
            "",
            f"- **{issue.kind}**: {issue.message}",
        ]
        if issue.context:
            parts.append("")
            parts.append("**Context:**")
            for key, val in issue.context.items():
                parts.append(f"  - {key}: {val!r}")
        return "\n".join(parts)

    parts = [f"**Found {len(error.issues)} validation issue(s):**", ""]
    for issue in error.issues:
        parts.append(f"- **{issue.kind}**: {issue.message}")
        if issue.context:
            ctx = ", ".join(f"`{k}={v!r}`" for k, v in issue.context.items())
            parts.append(f"  - {ctx}")
    return "\n".join(parts)


def _ok_html(message: str) -> str:
    return f'<div class="bcql-status-ok">✅ {message}</div>'


def _err_html(message: str) -> str:
    return f'<div class="bcql-status-err">❌ {message}</div>'


def validate_query(
    query: str,
    preset_name: str,
    use_custom: bool,
    custom_open: str,
    custom_closed: str,
    custom_strict: bool,
    custom_allow_alignment: bool,
    custom_allow_relations: bool,
    custom_span_tags: str,
    custom_relations: str,
    fail_fast: bool,
) -> tuple[str, str, dict[str, Any]]:
    """Run the parser/validator and return UI-ready values.

    Args:
        query: Raw query text; surrounding whitespace is stripped before parsing.
        preset_name: Key into ``PRESETS``. Only consulted when ``use_custom`` is
            false; an unknown name yields no spec (syntax-only parsing).
        use_custom: Build the spec from the ``custom_*`` values instead of
            looking up a preset.
        custom_open: Open attribute names (comma/newline-separated).
        custom_closed: Closed attributes, one ``name: value1, value2`` per line.
        custom_strict: Forwarded to the custom spec as ``strict_attributes``.
        custom_allow_alignment: Forwarded to the custom spec as ``allow_alignment``.
        custom_allow_relations: Forwarded to the custom spec as ``allow_relations``.
        custom_span_tags: Allowed span tags (comma/newline-separated).
        custom_relations: Allowed relation labels (comma/newline-separated).
        fail_fast: Forwarded to ``parse`` as ``fail_fast``.

    Returns:
        Tuple of ``(status_html, error_markdown, ast_dict)``.
        ``status_html`` is the banner markup, ``error_markdown`` is empty on
        success, and ``ast_dict`` is a plain dict; empty when no AST was produced.
    """
    query = (query or "").strip()
    if not query:
        # Nothing to validate: show the neutral "enter a query" prompt.
        return (EMPTY_STATUS, "", EMPTY_AST)

    spec: CorpusSpec | None
    if use_custom:
        try:
            spec = _build_custom_spec(
                custom_open,
                custom_closed,
                custom_strict,
                custom_allow_alignment,
                custom_allow_relations,
                custom_span_tags,
                custom_relations,
            )
        except ValueError as exc:
            # Malformed custom-spec form input; report it without parsing the query.
            return (
                _err_html("Invalid custom corpus spec."),
                f"**Custom spec error**\n\n> {exc}",
                EMPTY_AST,
            )
    else:
        spec = PRESETS.get(preset_name)

    try:
        ast = parse(query, spec=spec, fail_fast=fail_fast)
    except BCQLSyntaxError as exc:
        return (
            _err_html("Query failed to parse."),
            _format_syntax_error(exc),
            EMPTY_AST,
        )
    except BCQLValidationError as exc:
        # Try to also surface the parsed AST when validation (not parsing) fails:
        # re-parse without a spec. This is best-effort, so any failure simply
        # leaves the AST tab empty instead of masking the validation report.
        try:
            ast_only = parse(query)
            ast_dict = ast_only.model_dump(mode="json")
        except Exception:
            ast_dict = EMPTY_AST
        return (
            _err_html(
                "Query syntactically parses but does not match the corpus spec."
            ),
            _format_validation_error(exc),
            ast_dict,
        )

    label = (
        "Query is syntactically valid."
        if spec is None
        else (
            "Query is valid against the selected corpus spec "
            f"({'custom' if use_custom else preset_name})."
        )
    )
    return (_ok_html(label), "", ast.model_dump(mode="json"))


def validate_example(
    query: str, preset_name: str
) -> tuple[str, str, dict[str, Any]]:
    """Two-argument adapter so ``gr.Examples`` can call ``validate_query``.

    The example rows only carry a query and a preset label, so every other
    argument is pinned: no custom spec, empty custom fields, alignment and
    relations allowed, and ``fail_fast`` off.
    """
    return validate_query(
        query,
        preset_name,
        use_custom=False,
        custom_open="",
        custom_closed="",
        custom_strict=False,
        custom_allow_alignment=True,
        custom_allow_relations=True,
        custom_span_tags="",
        custom_relations="",
        fail_fast=False,
    )


def render_spec_description(preset_name: str) -> str:
    """Return the markdown shown in the "Active spec" tab for ``preset_name``.

    Looks the name up in ``PRESETS``. If it maps to a
    [CorpusSpec][bcql_py.validation.CorpusSpec], that spec's ``description`` is
    returned; if it maps to ``None`` or is unknown, a short hint explains that
    only syntactic parsing is performed.
    """
    selected = PRESETS.get(preset_name)
    if selected is not None:
        return selected.description
    return (
        "_No corpus spec selected: only syntactic parsing is performed._\n\n"
        "Pick a preset or enable **Custom corpus spec** to also run "
        "semantic validation against a corpus vocabulary."
    )


# Page header (title and tagline) rendered through ``gr.HTML`` at the top of the
# app; styled by the ``.bcql-header`` rules in CUSTOM_CSS.
INTRO_HTML = """
<div class="bcql-header">
  <h1>BCQL Validator</h1>
  <p>Parse and validate <strong><a href="https://blacklab.ivdnt.org/" target="_blank">BlackLab</a></strong>
  Corpus Query Language queries with
  <strong><a href="https://bramvanroy.github.io/bcql_py/" target="_blank"><code>bcql_py</code></a></strong></p>
</div>
"""

# "About" text rendered through ``gr.Markdown`` inside the right-hand sidebar.
ABOUT_MARKDOWN = """
## About

`bcql_py` is a Python parser for the BlackLab Corpus Query Language. It produces
a frozen Pydantic AST that round-trips back to BCQL or JSON, and ships an
optional semantic validation layer driven by a `CorpusSpec`.

This demo lets you:

- **Parse** any BCQL query and see the resulting AST as JSON.
- **Validate** the query against a built-in preset (`UD`, `Lassy`) or a
  custom corpus spec you define on the fly.

When parsing or validation fails, the error message points at the offending
position in the query: useful both for humans and for LLM-driven feedback
loops.

**Links:** [GitHub](https://github.com/BramVanroy/bcql_py) ·
[Documentation](https://bramvanroy.github.io/bcql_py/) ·
[BCQL cheatsheet](https://bramvanroy.github.io/bcql_py/guides/cheatsheet/)
"""

# Build the UI. Components are laid out in the order they are created here, so
# the statement order below is significant.
with gr.Blocks(
    title="BCQL Validator",
    analytics_enabled=False,
) as demo:
    # Heading
    gr.HTML(INTRO_HTML)

    # Sidebar with about text
    with gr.Sidebar(position="right"):
        gr.Markdown(ABOUT_MARKDOWN)

    # Main content: query input and results
    with gr.Row(equal_height=False):
        # Main column for input (query, spec, examples)
        with gr.Column(scale=5):
            query_input = gr.Textbox(
                label="BCQL query",
                value='[lemma="search" & pos="NOUN"]',
                lines=4,
                max_lines=12,
                placeholder='[pos="NOUN"]   "the" [pos="ADJ"]+ "man"   ...',
            )

            with gr.Row():
                preset_dropdown = gr.Dropdown(
                    choices=list(PRESETS.keys()),
                    value="None (permissive)",
                    label="Corpus preset",
                    scale=3,
                )
                fail_fast = gr.Checkbox(
                    value=False,
                    label="Fail fast",
                    info="Stop at the first issue instead of collecting all.",
                    scale=2,
                )

            # Form fields that feed ``_build_custom_spec`` (via ``validate_query``)
            # when "Use custom spec" is ticked.
            with gr.Accordion("Custom corpus spec (advanced)", open=False):
                use_custom = gr.Checkbox(
                    value=False,
                    label="Use custom spec instead of preset",
                )
                custom_open = gr.Textbox(
                    label="Open attributes",
                    placeholder="word, lemma, xpos",
                    info="Comma-separated annotation names with unconstrained values.",
                )
                custom_closed = gr.Textbox(
                    label="Closed attributes",
                    placeholder="pos: NOUN, VERB, ADJ\nNumber: Sing, Plur",
                    info="One per line: 'name: value1, value2, ...'.",
                    lines=4,
                )
                with gr.Row():
                    custom_strict = gr.Checkbox(
                        value=False,
                        label="Strict attributes",
                        info="Reject any annotation not listed above.",
                    )
                    custom_allow_alignment = gr.Checkbox(
                        value=True, label="Allow alignment (==>)"
                    )
                    custom_allow_relations = gr.Checkbox(
                        value=True, label="Allow relations (-->)"
                    )
                custom_span_tags = gr.Textbox(
                    label="Allowed span tags",
                    placeholder="s, p, ne",
                    info="Leave empty to allow any tag.",
                )
                custom_relations = gr.Textbox(
                    label="Allowed relation labels",
                    placeholder="nsubj, obj, amod",
                    info="Leave empty to allow any relation label.",
                )

            validate_btn = gr.Button(
                "Validate query", variant="primary", size="lg"
            )

        # Column for outputs (status banner, error details, AST / active-spec tabs)
        with gr.Column(scale=4):
            status_box = gr.HTML(value=EMPTY_STATUS, label="Status")
            error_md = gr.Markdown(
                value="",
                label="Error details",
                latex_delimiters=None,
            )

            with gr.Tabs():
                with gr.Tab(TAB_LABELS[0], render_children=True):
                    ast_output = gr.JSON(value=EMPTY_AST, label="")
                with gr.Tab(TAB_LABELS[1], render_children=True):
                    spec_md = gr.Markdown(
                        value=render_spec_description("None (permissive)"),
                    )
    # Component values are handed to ``validate_query`` in this order, so the list
    # must stay in sync with that function's parameter order.
    inputs = [
        query_input,
        preset_dropdown,
        use_custom,
        custom_open,
        custom_closed,
        custom_strict,
        custom_allow_alignment,
        custom_allow_relations,
        custom_span_tags,
        custom_relations,
        fail_fast,
    ]
    # Matches the 3-tuple returned by ``validate_query`` / ``validate_example``.
    outputs = [status_box, error_md, ast_output]

    # Clickable examples: only the query and the preset are filled in
    # (``inputs[:2]``), and results come from ``validate_example``.
    # NOTE(review): Gradio's docs appear to say ``preload`` only takes effect when
    # no input component has a developer-provided ``value``; both inputs here
    # set one -- confirm the preload actually happens.
    with gr.Row():
        gr.Examples(
            examples=EXAMPLES,
            inputs=inputs[:2],
            outputs=outputs,
            label="Examples",
            examples_per_page=10,
            cache_examples=True,
            fn=validate_example,
            preload=5,
        )

    gr.HTML(
        '<div class="bcql-footer">Built with '
        '<a href="https://bramvanroy.github.io/bcql_py/">bcql_py</a> · '
        '<a href="https://gradio.app/">Gradio</a></div>'
    )

    # Event wiring. Validation runs on button click and whenever the preset or
    # the "use custom spec" toggle changes; editing the other fields does not
    # trigger it.
    validate_btn.click(validate_query, inputs=inputs, outputs=outputs)
    # On preset change, refresh the Active spec tab and re-validate.
    preset_dropdown.change(
        render_spec_description,
        inputs=preset_dropdown,
        outputs=spec_md,
    )
    preset_dropdown.change(validate_query, inputs=inputs, outputs=outputs)
    use_custom.change(validate_query, inputs=inputs, outputs=outputs)

# Launch the app with the custom theme and stylesheet.
# NOTE(review): there is no ``if __name__ == "__main__":`` guard, so merely
# importing this module also calls ``launch()`` -- confirm that is intended.
demo.launch(theme=THEME, css=CUSTOM_CSS)