# Spaces:

# Optitransfer
# /

# crdt-merge-data

# Sleeping

# File size: 33,510 Bytes

# SPDX-License-Identifier: BUSL-1.1
# Copyright 2026 Ryan Gillespie / Optitransfer
#
# Licensed under the Business Source License 1.1 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://github.com/mgillr/crdt-merge/blob/main/LICENSE
#
# Change Date: 2028-03-29
# Change License: Apache License, Version 2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# On 2028-03-29 this file converts to Apache License, Version 2.0.

"""
crdt-merge v0.9.5 — Data Playground HuggingFace Space
Tabular CRDT merge, conflict analysis, and core primitive demonstrations.
"""

import json
import logging
import os
import time

import gradio as gr
import numpy as np
import plotly.graph_objects as go

# Custom stylesheet handed to gr.Blocks(css=...): dark zinc background,
# blue primary button, uppercase tab labels, dark table/input colours, and a
# hidden footer.
CSS = """
.gradio-container { background: #09090b !important; font-family: 'Inter', system-ui, sans-serif !important; }
.gr-button-primary { background: linear-gradient(135deg, #2563eb, #1d4ed8) !important; border: none !important; color: #fff !important; font-weight: 600 !important; }
footer { display: none !important; }
.tab-nav button { color: #a1a1aa !important; font-size: 13px !important; letter-spacing: 0.05em !important; text-transform: uppercase !important; font-weight: 600 !important; padding: 10px 16px !important; }
.tab-nav button.selected { color: #f4f4f5 !important; border-bottom: 2px solid #3b82f6 !important; }
.tab-nav button:hover { color: #e4e4e7 !important; }
code, .monospace { font-family: 'JetBrains Mono', ui-monospace, monospace !important; font-size: 13px !important; }
h1, h2, h3 { color: #f4f4f5 !important; }
p, li { color: #d4d4d8 !important; font-size: 15px !important; line-height: 1.7 !important; }
label, .gr-input-label, .label-wrap span { color: #e4e4e7 !important; font-size: 14px !important; font-weight: 500 !important; }
input, textarea, select, .gr-box { color: #f4f4f5 !important; background: #18181b !important; border-color: #3f3f46 !important; }
.gr-dataframe th, table th { color: #f4f4f5 !important; background: #18181b !important; font-weight: 600 !important; font-size: 13px !important; }
.gr-dataframe td, table td { color: #d4d4d8 !important; font-size: 13px !important; border-color: #27272a !important; }
.gr-dataframe tr:hover td { background: #1e1e22 !important; }
.gr-info, .info { color: #a1a1aa !important; font-size: 12px !important; }
strong { color: #f4f4f5 !important; }
"""

# Shared Plotly layout keyword arguments (dark palette matching CSS); it is
# splatted into fig.update_layout(**PLOTLY_LAYOUT, ...) by the chart builders.
_GRID_AXIS = {"gridcolor": "#27272a", "linecolor": "#27272a"}

PLOTLY_LAYOUT = {
    "paper_bgcolor": "#09090b",
    "plot_bgcolor": "#18181b",
    "font": {"color": "#a1a1aa", "family": "Inter"},
    # Each axis gets its own copy so mutating one never affects the other.
    "xaxis": dict(_GRID_AXIS),
    "yaxis": dict(_GRID_AXIS),
    "margin": {"l": 60, "r": 20, "t": 40, "b": 60},
}

# Gradio theme passed to gr.Blocks(theme=...): Base theme with blue primary
# and zinc neutral hues.
THEME = gr.themes.Base(
    primary_hue=gr.themes.colors.blue,
    neutral_hue=gr.themes.colors.zinc,
)

# Navigation bar (markdown) rendered at the very top of the page.
NAV_MD = """**[🏠 Flagship](https://huggingface.co/spaces/optitransfer/crdt-merge) · [🔬 Data Playground](https://huggingface.co/spaces/optitransfer/crdt-merge-data) · [🌐 Federation](https://huggingface.co/spaces/optitransfer/crdt-merge-federation) · [GitHub ↗](https://github.com/mgillr/crdt-merge) · [⭐ Star Repo](https://github.com/mgillr/crdt-merge/stargazers) · [👁️ Watch](https://github.com/mgillr/crdt-merge/subscription) · [📐 Architecture Deep Dive](https://github.com/mgillr/crdt-merge/tree/main/docs/architecture) · [PyPI ↗](https://pypi.org/project/crdt-merge/)**"""

# Page title / tagline (markdown) rendered directly below the navigation bar.
HERO_MD = """
# crdt-merge — Data Playground

Tabular CRDT merge for DataFrames and datasets. Conflict-free record merge, deduplication, and provenance tracking.

`pip install crdt-merge` · [GitHub](https://github.com/mgillr/crdt-merge) · [PyPI](https://pypi.org/project/crdt-merge/) · Patent UK 2607132.4, GB2608127.3 · E4 Trust-Delta Architecture
"""

# Choices offered by the "Merge Strategy" dropdown; each name is a key of the
# strategy map built inside run_dataset_merge().
STRATEGIES_DF = ["LWW", "MaxWins", "MinWins", "Union"]


# -----------------------------------------------------------------
# Data loading
# -----------------------------------------------------------------

def _load_dataset_records():
    """Try HF datasets first, fallback to synthetic."""
    source = "synthetic"
    records_a = []
    records_b = []

    try:
        from datasets import load_dataset
        ds = load_dataset("glue", "sst2", split="train[:200]")
        all_records = [{"id": i, "sentence": ds[i]["sentence"], "label": ds[i]["label"], "_ts": i}
                       for i in range(len(ds))]
        records_a = all_records[:150]
        # Node B: overlapping records (100-149) get modified values + later timestamps
        records_b = []
        for r in all_records[100:]:
            rid = r["id"]
            if rid < 150:  # overlapping region -- simulate a different node's edits
                records_b.append({
                    "id": rid,
                    "sentence": r["sentence"].strip() + " [node-B edit]",
                    "label": 1 - r["label"],  # flip label to create real conflict
                    "_ts": rid + 50,           # later timestamp for LWW
                })
            else:
                records_b.append(r)
        source = "glue/sst2 (HuggingFace datasets, 200 rows, 50 conflicting overlap)"
    except Exception:
        pass

    if not records_a:
        rng = np.random.RandomState(7)
        adjectives = ["good", "bad", "great", "poor", "excellent", "terrible", "fine", "awful"]
        nouns = ["film", "movie", "picture", "show", "performance", "script", "cast", "story"]
        for i in range(200):
            adj = adjectives[i % len(adjectives)]
            noun = nouns[i % len(nouns)]
            records_a.append({"id": i, "sentence": f"A {adj} {noun}.", "label": i % 2, "_ts": i})
        for i in range(100, 200):
            adj = adjectives[(i + 3) % len(adjectives)]
            noun = nouns[(i + 2) % len(nouns)]
            records_b.append({"id": i, "sentence": f"An {adj} {noun}.", "label": (i + 1) % 2, "_ts": i + 50})
        for i in range(200, 250):
            adj = adjectives[i % len(adjectives)]
            noun = nouns[i % len(nouns)]
            records_b.append({"id": i, "sentence": f"The {adj} {noun}.", "label": i % 2, "_ts": i})
        source = "synthetic (SST-2 style, 150 + 100 records with 50 overlap)"

    return records_a, records_b, source


# -----------------------------------------------------------------
# TAB 1 -- Dataset Merge
# -----------------------------------------------------------------

def _canonical_record(record):
    """Serialise *record* to JSON that is independent of key and set ordering."""

    def _fallback(value):
        # Sets iterate in arbitrary order; sort their string forms so equal
        # sets always serialise identically.  Anything else json cannot
        # encode is stringified.
        if isinstance(value, (set, frozenset)):
            return sorted(str(item) for item in value)
        return str(value)

    return json.dumps(record, sort_keys=True, default=_fallback)


def _e4_merge_trust_summary(merged, records_a, records_b):
    """Render the E4 trust / Merkle provenance markdown for a finished merge.

    Never raises: any failure (including the E4 package being absent) is
    reported inside the returned markdown.
    """
    try:
        # TypedTrustScore is not used below; the import is kept so that a
        # missing E4 package is still detected at this point.
        from crdt_merge.e4 import TypedTrustScore
        from crdt_merge.e4.delta_trust_lattice import DeltaTrustLattice
        from crdt_merge.e4.trust_bound_merkle import TrustBoundMerkle

        ids_a = {r["id"] for r in records_a}
        ts_b = {r["id"]: r.get("_ts") for r in records_b}

        lattice_a = DeltaTrustLattice(peer_id="node_A")
        lattice_b = DeltaTrustLattice(peer_id="node_B")

        # (peer, lattice label, score) in the order the table lists them.
        trust_views = [
            ("node_A", "node_A (self)", lattice_a.get_trust("node_A")),
            ("node_B", "node_B (self)", lattice_b.get_trust("node_B")),
            ("node_A", "node_B (cross)", lattice_b.get_trust("node_A")),
            ("node_B", "node_A (cross)", lattice_a.get_trust("node_B")),
        ]

        merkle = TrustBoundMerkle(trust_lattice=lattice_a)
        for r in merged:
            rid = r["id"]
            if rid not in ts_b:
                originator = "node_A"
            elif rid not in ids_a:
                originator = "node_B"
            else:
                # Both nodes hold this id: attribute the merged row to node B
                # when it carries node B's timestamp.  Looking the timestamp
                # up (instead of comparing against a fixed cut-off) keeps the
                # attribution correct for any dataset layout.
                originator = "node_B" if r.get("_ts") == ts_b[rid] else "node_A"
            merkle.insert_leaf(
                key=str(rid),
                data=json.dumps(r, default=str).encode(),
                originator=originator,
            )
        root_hash = merkle.recompute()

        table_lines = []
        for peer, view, score in trust_views:
            trust = score.overall_trust()
            status = "Probationary" if trust <= 0.5 else "Trusted"
            table_lines.append(f"| {peer} | {view} | {trust:.3f} | {status} |")
        trust_table = "\n".join(table_lines)

        return f"""
---
### E4 Trust Layer

| Peer | Lattice | Overall Trust | Status |
|------|---------|--------------|--------|
{trust_table}

**Merkle Provenance Root:** `{root_hash}`
**Merged records in Merkle tree:** {len(merged)}
**Trust scoring:** All merge participants start at probationary (0.5) trust. Trust accrues over time via successful merges and evidence accumulation.
"""
    except Exception as e:
        return f"\n\n---\n### E4 Trust Layer\n\nE4 trust module unavailable: {e}\n"


def run_dataset_merge(strategy_name: str):
    """Merge the two demo partitions with one strategy and summarise the run.

    Args:
        strategy_name: One of ``"LWW"``, ``"MaxWins"``, ``"MinWins"`` or
            ``"Union"``.

    Returns:
        ``(display_rows, summary_md)``: the first 20 merged records and a
        markdown report (metrics table, explanation, E4 trust section).  On
        failure -- including an unknown strategy name -- returns
        ``([], "Error: ...")``.
    """
    from crdt_merge.dataframe import merge as df_merge
    from crdt_merge.strategies import MergeSchema, LWW, MaxWins, MinWins, UnionSet

    strategy_classes = {
        "LWW": LWW,
        "MaxWins": MaxWins,
        "MinWins": MinWins,
        "Union": UnionSet,
    }
    strategy_cls = strategy_classes.get(strategy_name)
    if strategy_cls is None:
        # Report like every other failure instead of leaking a KeyError.
        return [], f"Error: unknown strategy {strategy_name!r}"
    schema = MergeSchema(default=strategy_cls())

    records_a, records_b, source = _load_dataset_records()
    ids_a = {r["id"] for r in records_a}
    ids_b = {r["id"] for r in records_b}
    t0 = time.perf_counter()

    try:
        merged = df_merge(records_a, records_b, key="id", schema=schema, timestamp_col="_ts")
        elapsed = (time.perf_counter() - t0) * 1000

        # Verify commutativity on record *contents*, not just on the key set:
        # two merges that keep the same ids but pick different values must
        # not be reported as PASS.
        merged_ba = df_merge(records_b, records_a, key="id", schema=schema, timestamp_col="_ts")
        canon_ab = sorted((r["id"], _canonical_record(r)) for r in merged)
        canon_ba = sorted((r["id"], _canonical_record(r)) for r in merged_ba)
        verdict = "PASS" if canon_ab == canon_ba else "FAIL"

        summary_md = f"""
**Dataset Merge Complete**

| Metric | Value |
|---|---|
| Source | {source} |
| Strategy | {strategy_name} |
| Node A records | {len(records_a)} |
| Node B records | {len(records_b)} |
| Overlapping IDs | {len(ids_a & ids_b)} |
| Merged records | {len(merged)} |
| Elapsed | {elapsed:.1f}ms |
| Commutative (merge_AB == merge_BA) | **{verdict}** |

### Understanding the Results

- **Merged Records Table:** Shows the first 20 rows after merging Node A and Node B. For overlapping record IDs (where both nodes have the same row but different values), the selected strategy decides which value wins.
- **Strategy Behavior:**
  - `LWW` (Last-Writer-Wins) — the record with the **later timestamp** (`_ts`) wins. This is the most common strategy in distributed databases.
  - `MaxWins` — for numeric fields, the **larger value** wins. For text, lexicographic max.
  - `MinWins` — the **smaller value** wins. Useful for minimum-bid auctions or earliest-deadline scenarios.
  - `Union` — keeps **all values** as a set (no data is lost, but deduplication may be needed downstream).
- **Commutativity PASS** means `merge(A, B)` and `merge(B, A)` produce identical results — a core CRDT guarantee. This ensures any two replicas performing the merge get the same output regardless of order.
"""

        # E4 Trust Layer -- trust scores and Merkle provenance for the merge
        summary_md += _e4_merge_trust_summary(merged, records_a, records_b)

        return merged[:20], summary_md

    except Exception as e:
        return [], f"Error: {e}"


# -----------------------------------------------------------------
# TAB 2 -- Conflict Analysis
# -----------------------------------------------------------------

def run_conflict_analysis():
    """Compare how the four merge strategies resolve the overlapping records.

    Every strategy merges the same two partitions; the overlapping records
    of each result are then compared field by field.

    Returns:
        ``(summary_rows, fig)``: one dict per strategy describing how many
        overlapping records it resolves differently from LWW, and a Plotly
        heatmap of pairwise per-field disagreement rates.
    """
    from crdt_merge.dataframe import merge as df_merge
    from crdt_merge.strategies import MergeSchema, LWW, MaxWins, MinWins, UnionSet

    records_a, records_b, source = _load_dataset_records()
    overlap_ids = {r["id"] for r in records_a} & {r["id"] for r in records_b}

    strategy_map = {
        "LWW": LWW(),
        "MaxWins": MaxWins(),
        "MinWins": MinWins(),
        "Union": UnionSet(),
    }
    strat_names = list(strategy_map)
    fields = ["sentence", "label"]

    # strategy name -> {record id -> merged record}, overlapping ids only.
    results_by_strategy = {}
    for strat_name, strat in strategy_map.items():
        schema = MergeSchema(default=strat)
        try:
            merged = df_merge(records_a, records_b, key="id", schema=schema, timestamp_col="_ts")
            results_by_strategy[strat_name] = {r["id"]: r for r in merged if r["id"] in overlap_ids}
        except Exception as exc:
            # Keep the demo alive, but leave a trace: a strategy with no
            # results shows up as "0 conflicts", which is otherwise
            # indistinguishable from genuine agreement.
            logging.getLogger(__name__).warning(
                "strategy %s failed to merge; it is reported with no records: %s",
                strat_name, exc,
            )
            results_by_strategy[strat_name] = {}

    def _differs(rec_x, rec_y, field):
        # Values are compared through str() so differently typed but equally
        # rendered values count as agreement.
        return str(rec_x.get(field, "")) != str(rec_y.get(field, ""))

    # Build conflict matrix: per-field, per-strategy-pair, fraction of
    # records that differ.  Disagreement is symmetric, so each unordered pair
    # is evaluated once and mirrored; the diagonal stays 0.
    size = len(strat_names)
    conflict_matrix = {}
    for field in fields:
        matrix = np.zeros((size, size), dtype=np.float32)
        for i in range(size):
            res_i = results_by_strategy[strat_names[i]]
            for j in range(i + 1, size):
                res_j = results_by_strategy[strat_names[j]]
                shared = [rid for rid in overlap_ids if rid in res_i and rid in res_j]
                diffs = sum(1 for rid in shared if _differs(res_i[rid], res_j[rid], field))
                rate = diffs / max(len(shared), 1)
                matrix[i, j] = rate
                matrix[j, i] = rate
        conflict_matrix[field] = matrix

    # Heatmap: combine fields side by side
    combined_z = np.concatenate([conflict_matrix[f] for f in fields], axis=1)
    col_labels = [f"{f}:{s}" for f in fields for s in strat_names]

    fig = go.Figure(data=go.Heatmap(
        z=combined_z.tolist(),
        x=col_labels,
        y=strat_names,
        colorscale=[[0, "#18181b"], [1, "#3b82f6"]],
        showscale=True,
        colorbar=dict(title="Conflict Rate"),
    ))
    fig.update_layout(
        **PLOTLY_LAYOUT,
        title=f"Per-Field Conflict Matrix — Strategy vs Strategy (source: {source[:40]}...)",
        xaxis_title="Field : Strategy (column)",
        yaxis_title="Strategy (row)",
    )

    # Summary table: how many overlapping records each strategy resolves
    # differently from LWW (a record counts once, however many fields differ).
    lww_results = results_by_strategy["LWW"]
    summary_rows = []
    for strat_name in strat_names:
        strat_results = results_by_strategy[strat_name]
        diffs_vs_lww = 0
        for rid in overlap_ids:
            r_lww = lww_results.get(rid)
            r_s = strat_results.get(rid)
            if r_lww and r_s and any(_differs(r_lww, r_s, field) for field in fields):
                diffs_vs_lww += 1
        summary_rows.append({
            "Strategy": strat_name,
            "Conflicts vs LWW": diffs_vs_lww,
            "Overlap Records": len(overlap_ids),
            "Conflict Rate": f"{diffs_vs_lww / max(len(overlap_ids), 1):.2%}",
        })

    return summary_rows, fig


def _e4_conflict_trust_analysis():
    """Build the E4 "Conflict Evidence" markdown for the overlapping records.

    For each id held by both nodes, a sentence mismatch is logged as an
    ``equivocation`` event and a label mismatch as an ``invalid_delta``
    event, both targeting node_B.  Any failure (including a missing E4
    package) is reported inside the returned markdown instead of raising.
    """
    try:
        from crdt_merge.e4 import TypedTrustScore
        from crdt_merge.e4.delta_trust_lattice import DeltaTrustLattice
        from crdt_merge.e4.proof_evidence import TrustEvidence, EVIDENCE_TYPES

        node_a_rows, node_b_rows, _source = _load_dataset_records()
        by_id_a = {row["id"]: row for row in node_a_rows}
        by_id_b = {row["id"]: row for row in node_b_rows}
        shared_ids = set(by_id_a) & set(by_id_b)

        lattice = DeltaTrustLattice(peer_id="auditor")

        # One (evidence_type, target, record_id, dimension) tuple per event.
        events = []

        # NOTE(review): the TrustEvidence objects created below are never
        # handed to `lattice`, so the scores read afterwards cannot reflect
        # them -- confirm whether they are meant to be applied.
        for rid in sorted(shared_ids):
            rec_a = by_id_a[rid]
            rec_b = by_id_b[rid]

            # Same key, different sentence -> equivocation evidence.
            if str(rec_a.get("sentence", "")) != str(rec_b.get("sentence", "")):
                TrustEvidence.create(
                    observer="auditor",
                    target="node_B",
                    evidence_type="equivocation",
                    dimension="consistency",
                    amount=-0.05,
                    proof=f"id={rid} sentence diverged".encode(),
                )
                events.append(("equivocation", "node_B", rid, "consistency"))

            # Different label -> invalid_delta evidence.
            if rec_a.get("label") != rec_b.get("label"):
                TrustEvidence.create(
                    observer="auditor",
                    target="node_B",
                    evidence_type="invalid_delta",
                    dimension="integrity",
                    amount=-0.1,
                    proof=f"id={rid} label flipped {rec_a.get('label')}->{rec_b.get('label')}".encode(),
                )
                events.append(("invalid_delta", "node_B", rid, "integrity"))

        total_events = len(events)
        equivocations = sum(1 for kind, *_ in events if kind == "equivocation")
        invalid_deltas = sum(1 for kind, *_ in events if kind == "invalid_delta")

        trust_a = lattice.get_trust("node_A").overall_trust()
        trust_b = lattice.get_trust("node_B").overall_trust()
        status_a = "Probationary" if trust_a <= 0.5 else "Trusted"
        status_b = "Probationary" if trust_b <= 0.5 else "Trusted"

        # Only the first ten events are listed; the rest are summarised.
        table_lines = [
            f"| {kind} | {target} | {rid} | {dim} |"
            for kind, target, rid, dim in events[:10]
        ]
        if total_events > 10:
            table_lines.append("| ... | ... | ... | ... |")
            table_lines.append(f"| *(total {total_events} events)* | | | |")
        verdict_table = "\n".join(table_lines)

        return f"""
---
### E4 Trust Layer -- Conflict Evidence

**Evidence Events Fired:** {total_events} total ({equivocations} equivocation, {invalid_deltas} invalid_delta)

| Evidence Type | Target | Record ID | Dimension |
|--------------|--------|-----------|-----------|
{verdict_table}

**Post-Evidence Trust Scores:**

| Peer | Overall Trust | Verdict |
|------|--------------|---------|
| node_A | {trust_a:.3f} | {status_a} -- no negative evidence |
| node_B | {trust_b:.3f} | {status_b} -- {total_events} evidence events filed |

**Interpretation:** Conflicts between nodes degrade trust for the conflicting peer. The trust lattice records evidence so downstream consumers can make trust-aware merge decisions (e.g., reject merges from peers below a trust threshold).
"""

    except Exception as e:
        return f"\n\n---\n### E4 Trust Layer -- Conflict Evidence\n\nE4 trust module unavailable: {e}\n"


# -----------------------------------------------------------------
# TAB 3 -- Core CRDT Primitives
# -----------------------------------------------------------------

def run_primitives_demo():
    """Exercise GCounter, PNCounter, LWWRegister and ORSet on two nodes.

    Each primitive is mutated independently on node A and node B and then
    merged in both directions.

    Returns:
        A list with one dict per primitive holding the operations performed,
        the stringified ``merge(A,B)`` / ``merge(B,A)`` values and a
        ``"PASS"``/``"FAIL"`` commutativity verdict.
    """
    from crdt_merge.core import GCounter, PNCounter, LWWRegister, ORSet

    # (name, node A ops, node B ops, merge(A,B), merge(B,A), commutative?)
    outcomes = []

    # --- GCounter ---------------------------------------------------------
    counter_a = GCounter()
    counter_a.increment("node_A", 5)
    counter_a.increment("node_A", 3)
    counter_b = GCounter()
    counter_b.increment("node_B", 7)
    g_ab = counter_a.merge(counter_b).value
    g_ba = counter_b.merge(counter_a).value
    outcomes.append((
        "GCounter",
        "gc_a.increment('node_A', 5); gc_a.increment('node_A', 3)  # value=8",
        "gc_b.increment('node_B', 7)  # value=7",
        g_ab,
        g_ba,
        g_ab == g_ba,
    ))

    # --- PNCounter --------------------------------------------------------
    signed_a = PNCounter()
    signed_a.increment("n", 10)
    signed_a.decrement("n", 3)
    signed_b = PNCounter()
    signed_b.increment("n", 5)
    pn_ab = signed_a.merge(signed_b).value
    pn_ba = signed_b.merge(signed_a).value
    outcomes.append((
        "PNCounter",
        "pn_a.increment('n', 10); pn_a.decrement('n', 3)  # value=7",
        "pn_b.increment('n', 5)  # value=5",
        pn_ab,
        pn_ba,
        pn_ab == pn_ba,
    ))

    # --- LWWRegister (values compared and shown as strings) ---------------
    register_a = LWWRegister()
    register_a.set("model_v1", timestamp=1.0)
    register_a.set("model_v2", timestamp=3.0)
    register_b = LWWRegister()
    register_b.set("model_v3", timestamp=2.0)
    lww_ab = str(register_a.merge(register_b).value)
    lww_ba = str(register_b.merge(register_a).value)
    outcomes.append((
        "LWWRegister",
        "lww_a.set('model_v1', timestamp=1.0); lww_a.set('model_v2', timestamp=3.0)",
        "lww_b.set('model_v3', timestamp=2.0)",
        lww_ab,
        lww_ba,
        lww_ab == lww_ba,
    ))

    # --- ORSet (members sorted so the comparison ignores ordering) --------
    members_a = ORSet()
    members_a.add("alpha")
    members_a.add("beta")
    members_a.add("gamma")
    members_b = ORSet()
    members_b.add("beta")
    members_b.add("delta")
    set_ab = sorted(members_a.merge(members_b).value)
    set_ba = sorted(members_b.merge(members_a).value)
    outcomes.append((
        "ORSet",
        "orset_a.add('alpha'); orset_a.add('beta'); orset_a.add('gamma')",
        "orset_b.add('beta'); orset_b.add('delta')",
        str(set_ab),
        str(set_ba),
        set_ab == set_ba,
    ))

    return [
        {
            "Primitive": name,
            "Node A Operations": ops_a,
            "Node B Operations": ops_b,
            "merge(A,B) Value": str(value_ab),
            "merge(B,A) Value": str(value_ba),
            "Commutative": "PASS" if commutative else "FAIL",
        }
        for name, ops_a, ops_b, value_ab, value_ba, commutative in outcomes
    ]


def _e4_primitives_trust():
    """Build the E4 "Primitive-Level Trust" markdown section.

    Demonstrates three E4 pieces: two causal trust clocks that are advanced
    and merged, a trust-bound Merkle tree holding one leaf per primitive
    operation, and the wire size of an aggregate proof-carrying operation.
    Any failure (including a missing E4 package) is reported inside the
    returned markdown instead of raising.
    """
    try:
        from crdt_merge.e4 import TypedTrustScore, FrozenDict
        from crdt_merge.e4.delta_trust_lattice import DeltaTrustLattice
        from crdt_merge.e4.trust_bound_merkle import TrustBoundMerkle
        from crdt_merge.e4.causal_trust_clock import CausalTrustClock
        from crdt_merge.e4.pco import AggregateProofCarryingOperation

        # --- Causal trust clocks: 3 ticks on node A, 2 on node B ----------
        tick_a = CausalTrustClock(peer_id="node_A")
        tick_b = CausalTrustClock(peer_id="node_B")
        for _ in range(3):
            tick_a = tick_a.increment()  # increment() returns the new clock
        for _ in range(2):
            tick_b = tick_b.increment()

        time_a = tick_a.logical_time
        time_b = tick_b.logical_time
        time_merged = tick_a.merge(tick_b).logical_time

        # --- Trust-bound Merkle tree over the primitive operations --------
        trust_lattice = DeltaTrustLattice(peer_id="node_A")
        tree = TrustBoundMerkle(trust_lattice=trust_lattice)

        # (leaf key, payload, originator)
        leaves = [
            ("gcounter_inc_A", b"increment(node_A, 5)", "node_A"),
            ("gcounter_inc_B", b"increment(node_B, 7)", "node_B"),
            ("pncounter_inc", b"increment(n, 10)", "node_A"),
            ("pncounter_dec", b"decrement(n, 3)", "node_A"),
            ("lww_set_v1", b"set(model_v1, ts=1.0)", "node_A"),
            ("lww_set_v3", b"set(model_v3, ts=2.0)", "node_B"),
            ("orset_add_alpha", b"add(alpha)", "node_A"),
            ("orset_add_delta", b"add(delta)", "node_B"),
        ]
        for leaf_key, payload, origin in leaves:
            tree.insert_leaf(key=leaf_key, data=payload, originator=origin)

        root = tree.recompute()
        leaf_count = len(leaves)
        root_preview = str(root)[:32]

        # --- PCO wire format (placeholder hash/signature bytes) -----------
        operation = AggregateProofCarryingOperation(
            aggregate_hash=b'\x00' * 32,
            signature=b'\x00' * 64,
            originator_id="node_A",
            metadata=b'{"ops": 8}',
            merkle_root_at_creation=str(root),
            clock_snapshot=b'\x03',
            trust_vector_hash="tvh_demo",
            delta_bounds=(),
        )
        wire_bytes = len(operation.to_wire())

        return f"""
---
### E4 Trust Layer -- Primitive-Level Trust

#### CausalTrustClock

| Clock | Operations | Logical Time |
|-------|-----------|-------------|
| node_A | 3 increments | {time_a} |
| node_B | 2 increments | {time_b} |
| merged(A, B) | merge | {time_merged} |

Causal trust clocks are immutable -- each `increment()` returns a new clock instance. The merged clock captures the causal frontier of both peers.

#### Trust-Bound Merkle Tree

| Property | Value |
|----------|-------|
| Leaves inserted | {leaf_count} |
| Operations covered | GCounter, PNCounter, LWWRegister, ORSet |
| Merkle root | `{root}` |

Every CRDT operation is recorded as a Merkle leaf with its originator. The trust-bound Merkle tree links each leaf to the originator's trust score in the lattice, enabling per-operation provenance auditing.

#### Proof-Carrying Operation (PCO) Wire Format

| Property | Value |
|----------|-------|
| Wire size | {wire_bytes} bytes |
| Originator | node_A |
| Merkle root at creation | `{root_preview}...` |
| Format | AggregateProofCarryingOperation |

The PCO bundles a cryptographic proof (aggregate hash + signature), the Merkle root at time of creation, and a clock snapshot into a compact wire format suitable for gossip protocols.
"""

    except Exception as e:
        return f"\n\n---\n### E4 Trust Layer -- Primitive-Level Trust\n\nE4 trust module unavailable: {e}\n"



# -----------------------------------------------------------------
# Gradio UI
# -----------------------------------------------------------------

# Page layout: navigation + hero markdown, three tabs, then a footer.
# Components render in the order they are created, so statement order below
# is the visual order on the page.
with gr.Blocks(theme=THEME, css=CSS, title="crdt-merge — Data Playground") as demo:
    gr.Markdown(NAV_MD)
    gr.Markdown(HERO_MD)

    with gr.Tabs():

        # -- TAB 1 --------------------------------------------------------
        with gr.Tab("Dataset Merge"):
            gr.Markdown("""
## Dataset Merge

Loads glue/sst2 from HuggingFace datasets (first 200 rows) or uses synthetic fallback.
Splits into two node partitions with 50 overlapping records.
Demonstrates conflict-free merge with configurable strategy.

> **E4 Trust Scoring Active (v0.9.5+):** All merge operations now carry typed trust scores by default. Every record merge accumulates accuracy, consistency, recency, and provenance trust dimensions via GCounter-backed convergent accumulators. Trust propagation adds zero API overhead -- it activates transparently on `import crdt_merge`.
""")

            with gr.Row():
                strat_dd = gr.Dropdown(
                    choices=STRATEGIES_DF,
                    value="LWW",
                    label="Merge Strategy",
                    info="LWW = Last Write Wins (by timestamp). MaxWins/MinWins = field max/min. Union = set union.",
                )
                merge_ds_btn = gr.Button("Run Dataset Merge", variant="primary")

            merge_summary_md = gr.Markdown()
            merge_result_table = gr.Dataframe(
                headers=["id", "sentence", "label", "_ts"],
                label="Merged Records (first 20 rows)",
                wrap=True,
            )

            # Callback: run the merge and reshape the record dicts into
            # row lists in the same column order as the table headers.
            # Returns (summary markdown, table rows) to match `outputs`.
            def _run_ds_merge(strategy):
                rows, summary = run_dataset_merge(strategy)
                df_data = [[r.get("id", ""), r.get("sentence", ""), r.get("label", ""), r.get("_ts", "")] for r in rows]
                return summary, df_data

            # Run on button click, and once with LWW when the page loads.
            merge_ds_btn.click(_run_ds_merge, inputs=[strat_dd], outputs=[merge_summary_md, merge_result_table])
            demo.load(lambda: _run_ds_merge("LWW"), outputs=[merge_summary_md, merge_result_table])

        # -- TAB 2 --------------------------------------------------------
        with gr.Tab("Conflict Analysis"):
            gr.Markdown("""
## Conflict Analysis

Runs the same dataset through all 4 strategies and computes per-field conflict rates
between strategy pairs. The heatmap shows how often two strategies disagree on a record.

### How to Read the Results

- **Conflict Rate Heatmap:** Each cell shows the fraction of overlapping records where two strategies produce **different values** for a given field. Brighter = more disagreement. The diagonal is always 0 (a strategy agrees with itself).
  - `sentence:LWW` vs `sentence:MaxWins` = "how often do LWW and MaxWins disagree on the sentence field?"
  - High conflict rates between strategies mean the choice of strategy materially affects the merged output.
- **Comparison Table:** Shows how each strategy differs from LWW (the baseline). `0 conflicts` = identical behavior for this dataset. Higher numbers indicate the strategy resolves more records differently.
- **Why this matters:** In production systems, teams need to understand which strategy is appropriate for their data. If all strategies agree, the choice doesn't matter. If they diverge significantly, the strategy selection is a critical design decision.
""")

            with gr.Row():
                conflict_btn = gr.Button("Run Conflict Analysis", variant="primary")

            conflict_chart = gr.Plot(label="Per-Field Conflict Matrix Heatmap")
            conflict_table = gr.Dataframe(
                headers=["Strategy", "Conflicts vs LWW", "Overlap Records", "Conflict Rate"],
                label="Strategy Comparison",
            )
            conflict_e4_md = gr.Markdown()

            # Callback: heatmap + per-strategy summary rows (in header
            # order) + the E4 conflict-evidence markdown.
            def _run_conflict():
                rows, fig = run_conflict_analysis()
                df_data = [
                    [r["Strategy"], r["Conflicts vs LWW"], r["Overlap Records"], r["Conflict Rate"]]
                    for r in rows
                ]
                e4_md = _e4_conflict_trust_analysis()
                return fig, df_data, e4_md

            # Run on button click and once when the page loads.
            conflict_btn.click(_run_conflict, outputs=[conflict_chart, conflict_table, conflict_e4_md])
            demo.load(_run_conflict, outputs=[conflict_chart, conflict_table, conflict_e4_md])

        # -- TAB 3 --------------------------------------------------------
        with gr.Tab("Core CRDT Primitives"):
            gr.Markdown("""
## Core CRDT Primitives

Live demonstration of GCounter, PNCounter, LWWRegister, and ORSet.
Each primitive is operated on two nodes independently, then merged in both directions.
Commutativity is verified: merge(A,B) must equal merge(B,A).

### How to Read the Results

| Primitive | What It Does | Merge Semantics |
|---|---|---|
| **GCounter** | Grow-only counter | Each node's count is tracked separately. Merge takes the **max per node**, then sums. Node A=8 + Node B=7 → merged=15. |
| **PNCounter** | Increment/decrement counter | Two internal GCounters (positive + negative). Merge takes max per node for each. Net value = positives − negatives. |
| **LWWRegister** | Last-Writer-Wins register | Stores a single value + timestamp. Merge keeps the value with the **latest timestamp**. Node A writes "model_v2" at t=3.0 > Node B's t=2.0, so A wins. |
| **ORSet** | Observed-Remove Set | Add/remove elements with unique tags. Merge is the **union** of all adds minus confirmed removes. Both nodes' elements appear in the merged set. |

- **merge(A,B) = merge(B,A):** The "Commutative" column proves this. PASS means the primitive is safe for distributed use — merge order doesn't affect the result.
- These are the building blocks that power crdt-merge's higher-level DataFrame and model merge operations.
""")

            with gr.Row():
                prim_btn = gr.Button("Run Primitives Demo", variant="primary")

            prim_table = gr.Dataframe(
                headers=["Primitive", "Node A Operations", "Node B Operations",
                         "merge(A,B) Value", "merge(B,A) Value", "Commutative"],
                label="Primitive Commutativity Proof",
                wrap=True,
            )
            prim_e4_md = gr.Markdown()

            # Callback: primitive result rows (in header order) + the E4
            # primitive-level trust markdown.
            def _run_prims():
                rows = run_primitives_demo()
                table_data = [
                    [r["Primitive"], r["Node A Operations"], r["Node B Operations"],
                     r["merge(A,B) Value"], r["merge(B,A) Value"], r["Commutative"]]
                    for r in rows
                ]
                e4_md = _e4_primitives_trust()
                return table_data, e4_md

            # Run on button click and once when the page loads.
            prim_btn.click(_run_prims, outputs=[prim_table, prim_e4_md])
            demo.load(_run_prims, outputs=[prim_table, prim_e4_md])

    # Footer: version / licence line and a repeat of the navigation links.
    gr.Markdown("""
---

**crdt-merge v0.9.5** · Patent UK 2607132.4, GB2608127.3 · E4 Trust-Delta · BUSL-1.1 → Apache 2.0 (2028-03-29)

[🏠 Flagship](https://huggingface.co/spaces/optitransfer/crdt-merge) · [🔬 Data Playground](https://huggingface.co/spaces/optitransfer/crdt-merge-data) · [🌐 Federation](https://huggingface.co/spaces/optitransfer/crdt-merge-federation) · [GitHub](https://github.com/mgillr/crdt-merge) · [⭐ Star Repo](https://github.com/mgillr/crdt-merge/stargazers) · [👁️ Watch](https://github.com/mgillr/crdt-merge/subscription) · [📐 Architecture Deep Dive](https://github.com/mgillr/crdt-merge/tree/main/docs/architecture) · [PyPI](https://pypi.org/project/crdt-merge/) · `pip install crdt-merge`
""")

# Start the Gradio server only when executed as a script.
if __name__ == "__main__":
    demo.launch()