Spaces:

Praneshrajan15
/

dataforge-playground

Running

App Files Files Community

Praneshrajan15 committed 1 day ago

Commit

eed1cab

verified ·

1 Parent(s): fe6681f

Deploy DataForge playground API

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

Dockerfile +63 -28
README.md +21 -18
README_MAIN.md +0 -96
dataforge/__init__.py +115 -2
dataforge/agent/__init__.py +16 -1
dataforge/agent/providers.py +11 -3
dataforge/agent/scratchpad.py +183 -0
dataforge/agent/tool_actions.py +343 -0
dataforge/bench/core.py +6 -0
dataforge/bench/groq_client.py +306 -27
dataforge/bench/methods.py +35 -0
dataforge/bench/report.py +19 -13
dataforge/bench/runner.py +45 -6
dataforge/causal/__init__.py +21 -1
dataforge/causal/dag.py +174 -0
dataforge/causal/pc.py +232 -0
dataforge/causal/root_cause.py +193 -0
dataforge/cli/__init__.py +10 -4
dataforge/cli/audit.py +70 -0
dataforge/cli/bench.py +23 -4
dataforge/cli/common.py +26 -4
dataforge/cli/profile.py +61 -16
dataforge/cli/release.py +39 -0
dataforge/cli/repair.py +104 -249
dataforge/cli/watch.py +142 -0
dataforge/datasets/embedded/hospital/clean.csv +11 -0
dataforge/datasets/embedded/hospital/dirty.csv +11 -0
dataforge/datasets/real_world.py +37 -7
dataforge/detectors/__init__.py +2 -4
dataforge/detectors/base.py +5 -5
dataforge/detectors/decimal_shift.py +11 -17
dataforge/detectors/fd_violation.py +21 -24
dataforge/detectors/type_mismatch.py +6 -13
dataforge/engine/__init__.py +33 -1
dataforge/engine/repair.py +670 -0
dataforge/env/__init__.py +22 -1
dataforge/env/environment.py +884 -0
dataforge/env/observation.py +61 -0
dataforge/env/openenv_core.py +146 -0
dataforge/env/reward.py +128 -0
dataforge/env/server.py +175 -0
dataforge/evaluation_contract.py +76 -0
dataforge/fixtures/hospital_10rows.csv +11 -0
dataforge/fixtures/hospital_schema.yaml +17 -0
dataforge/http/__init__.py +1 -0
dataforge/http/problem.py +99 -0
dataforge/integrations/dbt.py +1 -0
dataforge/observability.py +76 -0
dataforge/py.typed +1 -0
dataforge/release/__init__.py +2 -0

Dockerfile CHANGED Viewed

@@ -1,28 +1,63 @@
-# DataForge Playground - Multi-stage Docker build for HF Spaces.
-FROM python:3.12-slim AS builder
-WORKDIR /build
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends gcc g++ && \
-    rm -rf /var/lib/apt/lists/*
-COPY playground/api/requirements.txt /build/requirements.txt
-RUN pip install --no-cache-dir -r /build/requirements.txt
-COPY pyproject.toml /build/dataforge_src/pyproject.toml
-COPY README_MAIN.md /build/dataforge_src/README.md
-COPY dataforge/ /build/dataforge_src/dataforge/
-COPY constitutions/ /build/dataforge_src/constitutions/
-RUN pip install --no-cache-dir /build/dataforge_src
-FROM python:3.12-slim
-RUN useradd -m -u 1000 user
-COPY --from=builder /usr/local/lib/python3.12/site-packages /usr/local/lib/python3.12/site-packages
-COPY --from=builder /usr/local/bin /usr/local/bin
-COPY --from=builder /build/dataforge_src/constitutions /usr/local/lib/python3.12/site-packages/constitutions
-COPY playground/api/app.py /home/user/app/app.py
-COPY playground/api/samples/ /home/user/app/samples/
-COPY playground/web/ /home/user/app/web/
-USER user
-WORKDIR /home/user/app
-EXPOSE 7860
-ENV PORT=7860
-ENV DATAFORGE_PLAYGROUND_DEV=0
-CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1", "--timeout-keep-alive", "5"]

+# DataForge Playground — Multi-stage Docker build for HF Spaces.
+#
+# Target: <= 600 MB image. Runs as non-root UID 1000 (HF requirement).
+# Single-worker uvicorn with --timeout-keep-alive 5 (slowloris mitigation).
+#
+# See specs/SPEC_playground.md §4 and §6.5.
+# ============================================================
+# Stage 1: builder — install all Python dependencies
+# ============================================================
+FROM python:3.12-slim AS builder
+WORKDIR /build
+# System deps for building wheels
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends gcc g++ && \
+    rm -rf /var/lib/apt/lists/*
+# Install playground API requirements
+COPY playground/api/requirements.txt /build/requirements.txt
+RUN pip install --no-cache-dir -r /build/requirements.txt
+# Copy dataforge source and install it
+COPY pyproject.toml /build/dataforge_src/pyproject.toml
+COPY README.md /build/dataforge_src/README.md
+COPY dataforge/ /build/dataforge_src/dataforge/
+COPY constitutions/ /build/dataforge_src/constitutions/
+RUN pip install --no-cache-dir /build/dataforge_src
+# ============================================================
+# Stage 2: runtime — minimal image with only installed packages
+# ============================================================
+FROM python:3.12-slim
+# HF Spaces requires non-root user with UID 1000
+RUN useradd -m -u 1000 user
+# Copy installed Python packages from builder
+COPY --from=builder /usr/local/lib/python3.12/site-packages /usr/local/lib/python3.12/site-packages
+COPY --from=builder /usr/local/bin /usr/local/bin
+# Copy constitutions to the site-packages-relative path SafetyFilter expects.
+COPY --from=builder /build/dataforge_src/constitutions /usr/local/lib/python3.12/site-packages/constitutions
+# Copy application code
+COPY playground/api/app.py /home/user/app/app.py
+COPY playground/api/samples/ /home/user/app/samples/
+# Switch to non-root user
+USER user
+WORKDIR /home/user/app
+# Expose the port HF Spaces expects
+EXPOSE 7860
+# Environment
+ENV PORT=7860
+ENV DATAFORGE_PLAYGROUND_DEV=0
+# Start uvicorn with a single worker (slowapi in-memory limiter contract) and
+# honor PORT for Hugging Face runtime assignment. `exec` makes uvicorn replace
+# the wrapper shell, so it runs as PID 1 and receives SIGTERM directly when the
+# container is stopped instead of depending on the shell to forward it.
+CMD ["sh", "-c", "exec uvicorn app:app --host 0.0.0.0 --port ${PORT:-7860} --workers 1 --timeout-keep-alive 5"]

README.md CHANGED Viewed

@@ -7,37 +7,40 @@ sdk: docker
 app_port: 7860
 pinned: false
 license: apache-2.0
-short_description: Upload a CSV, profile and dry-run-repair it in your browser.
 ---
-# DataForge Playground
-Upload a CSV file and instantly profile it for data-quality issues or
-preview proposed repairs — all in your browser, no installation required.
-**What it does:**
-- **Profile**: Detects type mismatches, decimal shifts, and functional
-  dependency violations using heuristic detectors.
-- **Repair (Dry Run)**: Proposes fixes through the full Safety → Verifier →
-  Transaction pipeline, returning an ephemeral transaction journal.
-**What it does NOT do:**
-- No data is persisted. Your file is processed in memory and discarded.
-- No cookies, no analytics of file contents.
-- No LLM calls by default (opt-in only, requires a configured key).
-## Run locally instead
 ```bash
-pip install dataforge
-dataforge profile your_data.csv
-dataforge repair your_data.csv --dry-run
 ```
 ## Source
-- Main repository: [github.com/Praneshrajan15/data-quality-env](https://github.com/Praneshrajan15/data-quality-env)
 - Spec: `specs/SPEC_playground.md`
 - License: Apache-2.0

 app_port: 7860
 pinned: false
 license: apache-2.0
+short_description: Profile CSVs and dry-run safe repairs.
 ---
+# DataForge Playground API
+This is the API backend for the DataForge playground. The browser UI is deployed
+separately through Cloudflare Workers Static Assets; this Hugging Face Docker
+Space serves stateless CSV profiling and dry-run repair endpoints.
+## What It Does
+- Profile: detects type mismatches, decimal shifts, and functional dependency
+  violations.
+- Repair dry run: proposes fixes through SafetyFilter -> SMTVerifier and
+  returns an ephemeral transaction receipt without persisting user data.
+- Samples: serves small deterministic CSV examples for the static frontend.
+## What It Does Not Do
+- It does not persist uploaded files.
+- It does not use cookies or analytics for file contents.
+- It does not call an LLM by default.
+- It does not perform autonomous production repair.
+## Run Locally
 ```bash
+python -m pip install -e ".[dev]"
+pip install -r playground/api/requirements.txt
+uvicorn playground.api.app:app --reload --port 7860
 ```
 ## Source
+- Main repository: `github.com/Praneshrajan15/data-quality-env`
 - Spec: `specs/SPEC_playground.md`
 - License: Apache-2.0

README_MAIN.md DELETED Viewed

@@ -1,96 +0,0 @@
-# DataForge
-DataForge currently ships a real Week 3 CLI for CSV profiling and repair.
-This repository now includes shipped detectors, deterministic repairers,
-constitutional safety gating, SMT-backed structural verification, reversible
-transaction logs, and real-world benchmark infrastructure. The hosted
-playground, warehouse integrations, and trained model family remain future
-work.
-## Current Status
-- `dataforge profile`, `dataforge repair`, `dataforge revert`, and `dataforge bench`
-- Three shipped detectors: `type_mismatch`, `decimal_shift`, `fd_violation`
-- Three shipped repairers with safety + verifier gating in the apply path
-- Reversible transaction logs with byte-identical revert via source snapshots
-- Benchmark/report generation infrastructure for Hospital / Flights / Beers
-- `Makefile` targets for setup, lint, type-checking, and tests
-- CI plus unit / integration / property / adversarial coverage
-## Benchmark Results
-<!-- BENCH:START -->
-Generated from `eval/results/agent_comparison.json`.
-| Method | Precision | Recall | F1 | Avg Steps | Quota Units |
-| --- | --- | --- | --- | --- | --- |
-| heuristic | 0.0000 | 0.0000 | 0.0000 | 134.33 | 0.0000 |
-| llm_react | Skipped | Skipped | Skipped | Skipped | Skipped |
-| llm_zeroshot | Skipped | Skipped | Skipped | Skipped | Skipped |
-| random | 0.0038 | 0.0003 | 0.0005 | 150.33 | 0.0000 |
-See `BENCHMARK_REPORT.md` for per-dataset tables, error bars, and citation-only SOTA rows.
-Skipped methods in this run: DATAFORGE_LLM_PROVIDER must be set to groq.
-<!-- BENCH:END -->
-## Local Setup
-```bash
-make setup
-make lint
-make type
-make test
-```
-Verification works on Linux, macOS, or Windows (with Git Bash as the
-shell substrate for GNU Make). Requires Python 3.11 or 3.12
-(`requires-python = ">=3.11,<3.13"`).
-### Windows-specific setup
-```powershell
-# Install Python 3.12 and GNU Make if not present
-winget install -e --id Python.Python.3.12
-winget install -e --id ezwinports.make
-# Create and activate a project venv
-py -3.12 -m venv .venv
-.\.venv\Scripts\Activate.ps1
-# Install dependencies and verify
-python -m pip install -e ".[all]"
-make lint && make type && make test
-```
-Git for Windows provides the Bash implementation the Makefile uses on Windows.
-Do not rely on `C:\Windows\System32\bash.exe` (WSL).
-## Environment Variables
-Future provider keys belong in a root `.env` file that is gitignored and meant
-to be loaded with `python-dotenv`.
-- `GROQ_API_KEY`
-- `GEMINI_API_KEY`
-- `CEREBRAS_API_KEY`
-- `OPENROUTER_API_KEY`
-- `HF_TOKEN`
-## Repository Docs
-- [.cursor/rules/dataforge.md](.cursor/rules/dataforge.md) — always-applied rules
-- [ARCHITECTURE.md](ARCHITECTURE.md) — system diagram and dependency justification
-- [DECISIONS.md](DECISIONS.md) — technical decision log
-- [CONTRIBUTING.md](CONTRIBUTING.md) — workflow and code standards
-- [CLAUDE.md](CLAUDE.md) — living knowledge base for Cursor sessions
-- [CURSOR_MASTER.md](CURSOR_MASTER.md) — full context and prompt pack
-- [META_CONTEXT.md](META_CONTEXT.md) — meta-context (read before writing code)
-- [FILE_STRUCTURE.md](FILE_STRUCTURE.md) — canonical target directory tree
-- [SECURITY.md](SECURITY.md) — vulnerability reporting policy
-- [specs/SPEC_TEMPLATE.md](specs/SPEC_TEMPLATE.md) — spec template for new modules
-## License
-Apache-2.0. See [LICENSE](LICENSE).

dataforge/__init__.py CHANGED Viewed

@@ -1,5 +1,118 @@
-"""DataForge public package."""
-__all__ = ["__version__"]
 __version__ = "0.1.0"

+"""DataForge public package.
+The root package is the stable facade for integration surfaces. Symbols are
+resolved lazily so importing :mod:`dataforge` does not eagerly import pandas,
+FastAPI-facing helpers, or the SMT stack.
+"""
+from __future__ import annotations
+from importlib import import_module
+from typing import TYPE_CHECKING, Any
+if TYPE_CHECKING:
+    from dataforge.cli.common import load_schema, read_csv, schema_from_mapping
+    from dataforge.detectors import Issue, Schema, Severity, run_all_detectors
+    from dataforge.engine.repair import (
+        CandidateFix,
+        RepairFailure,
+        RepairPipelineRequest,
+        RepairPipelineResult,
+        RepairReceipt,
+        VerifiedFix,
+        run_repair_pipeline,
+    )
+    from dataforge.repair_contract import CONTRACT_VERSION
+    from dataforge.repairers import ProposedFix
+    from dataforge.safety import SafetyContext, SafetyFilter, SafetyResult, SafetyVerdict
+    from dataforge.transactions.log import (
+        TransactionAuditReport,
+        TransactionAuditVerdict,
+        TransactionLogError,
+        verify_transaction_log,
+    )
+    from dataforge.transactions.revert import TransactionRevertError, revert_transaction
+    from dataforge.transactions.txn import CellFix, RepairTransaction
+    from dataforge.verifier import SMTVerifier, VerificationResult, VerificationVerdict
+__all__ = [
+    "CONTRACT_VERSION",
+    "CandidateFix",
+    "CellFix",
+    "Issue",
+    "ProposedFix",
+    "RepairFailure",
+    "RepairPipelineRequest",
+    "RepairPipelineResult",
+    "RepairReceipt",
+    "RepairTransaction",
+    "SMTVerifier",
+    "SafetyContext",
+    "SafetyFilter",
+    "SafetyResult",
+    "SafetyVerdict",
+    "Schema",
+    "Severity",
+    "TransactionAuditReport",
+    "TransactionAuditVerdict",
+    "TransactionLogError",
+    "TransactionRevertError",
+    "VerificationResult",
+    "VerificationVerdict",
+    "VerifiedFix",
+    "__version__",
+    "load_schema",
+    "read_csv",
+    "revert_transaction",
+    "run_all_detectors",
+    "run_repair_pipeline",
+    "schema_from_mapping",
+    "verify_transaction_log",
+]
 __version__ = "0.1.0"
+_PUBLIC_EXPORTS: dict[str, tuple[str, str]] = {
+    "CONTRACT_VERSION": ("dataforge.repair_contract", "CONTRACT_VERSION"),
+    "CandidateFix": ("dataforge.engine.repair", "CandidateFix"),
+    "CellFix": ("dataforge.transactions.txn", "CellFix"),
+    "Issue": ("dataforge.detectors", "Issue"),
+    "ProposedFix": ("dataforge.repairers", "ProposedFix"),
+    "RepairFailure": ("dataforge.engine.repair", "RepairFailure"),
+    "RepairPipelineRequest": ("dataforge.engine.repair", "RepairPipelineRequest"),
+    "RepairPipelineResult": ("dataforge.engine.repair", "RepairPipelineResult"),
+    "RepairReceipt": ("dataforge.engine.repair", "RepairReceipt"),
+    "RepairTransaction": ("dataforge.transactions.txn", "RepairTransaction"),
+    "SMTVerifier": ("dataforge.verifier", "SMTVerifier"),
+    "SafetyContext": ("dataforge.safety", "SafetyContext"),
+    "SafetyFilter": ("dataforge.safety", "SafetyFilter"),
+    "SafetyResult": ("dataforge.safety", "SafetyResult"),
+    "SafetyVerdict": ("dataforge.safety", "SafetyVerdict"),
+    "Schema": ("dataforge.detectors", "Schema"),
+    "Severity": ("dataforge.detectors", "Severity"),
+    "TransactionAuditReport": ("dataforge.transactions.log", "TransactionAuditReport"),
+    "TransactionAuditVerdict": ("dataforge.transactions.log", "TransactionAuditVerdict"),
+    "TransactionLogError": ("dataforge.transactions.log", "TransactionLogError"),
+    "TransactionRevertError": ("dataforge.transactions.revert", "TransactionRevertError"),
+    "VerificationResult": ("dataforge.verifier", "VerificationResult"),
+    "VerificationVerdict": ("dataforge.verifier", "VerificationVerdict"),
+    "VerifiedFix": ("dataforge.engine.repair", "VerifiedFix"),
+    "load_schema": ("dataforge.cli.common", "load_schema"),
+    "read_csv": ("dataforge.cli.common", "read_csv"),
+    "revert_transaction": ("dataforge.transactions.revert", "revert_transaction"),
+    "run_all_detectors": ("dataforge.detectors", "run_all_detectors"),
+    "run_repair_pipeline": ("dataforge.engine.repair", "run_repair_pipeline"),
+    "schema_from_mapping": ("dataforge.cli.common", "schema_from_mapping"),
+    "verify_transaction_log": ("dataforge.transactions.log", "verify_transaction_log"),
+}
+def __getattr__(name: str) -> Any:
+    """Resolve public facade exports on first use."""
+    try:
+        module_name, attribute_name = _PUBLIC_EXPORTS[name]
+    except KeyError as exc:
+        raise AttributeError(name) from exc
+    value = getattr(import_module(module_name), attribute_name)
+    globals()[name] = value
+    return value

dataforge/agent/__init__.py CHANGED Viewed

	@@ -1 +1,16 @@
-"""Agent package scaffolding for DataForge."""

+"""DataForge agent package — typed tool-use actions and scratchpad.
+Public API:
+    parse_action — Parse raw dict into typed Action model.
+    Action       — Discriminated union of all action types.
+    Scratchpad   — In-episode hypothesis tracker.
+"""
+from dataforge.agent.scratchpad import Scratchpad
+from dataforge.agent.tool_actions import Action, parse_action
+__all__ = [
+    "Action",
+    "Scratchpad",
+    "parse_action",
+]

dataforge/agent/providers.py CHANGED Viewed

@@ -59,8 +59,9 @@ def get_provider_name() -> str:
     """Read the active provider from the environment.
     Returns:
-        The lowercased provider name from ``DATAFORGE_LLM_PROVIDER``,
-        defaulting to ``"groq"`` if not set.
     Example:
         >>> import os
@@ -68,7 +69,14 @@ def get_provider_name() -> str:
         >>> get_provider_name()
         'gemini'
     """
-    return os.environ.get("DATAFORGE_LLM_PROVIDER", "groq").lower()
 async def complete(

     """Read the active provider from the environment.
     Returns:
+        The lowercased provider name from ``DATAFORGE_LLM_PROVIDER``.
+        When no explicit provider is configured, prefer a provider whose
+        credential is present in the environment.
     Example:
         >>> import os
         >>> get_provider_name()
         'gemini'
     """
+    configured = os.environ.get("DATAFORGE_LLM_PROVIDER")
+    if configured:
+        return configured.lower()
+    if os.environ.get("GROQ_API_KEY"):
+        return "groq"
+    if os.environ.get("GEMINI_API_KEY"):
+        return "gemini"
+    return "groq"
 async def complete(

dataforge/agent/scratchpad.py ADDED Viewed

	@@ -0,0 +1,183 @@

+"""In-episode hypothesis and issue tracker for the DataForge RL agent.
+The scratchpad is a mutable, episode-scoped data structure that the agent
+uses to record hypotheses, confirmed issues, and dead ends. The environment
+exposes a compact summary of the scratchpad in each observation, enabling
+the agent to reason about its investigation history without direct access
+to the underlying data structure.
+Example::
+    >>> from dataforge.agent.scratchpad import Scratchpad
+    >>> pad = Scratchpad()
+    >>> _ = pad.add_hypothesis("Rating column has decimal shift", [5], ["rating"], "decimal_shift")
+    >>> pad.confirm_issue(5, "rating", "decimal_shift")
+    >>> pad.summary()
+    'Hypotheses: 1 (1 pending). Confirmed: 1. Dead ends: 0.'
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field, replace
+__all__ = [
+    "ConfirmedIssue",
+    "DeadEnd",
+    "HypothesisRecord",
+    "Scratchpad",
+]
+@dataclass(frozen=True)
+class HypothesisRecord:
+    """A recorded hypothesis about a data-quality root cause.
+    Args:
+        claim: Textual description of the hypothesis.
+        affected_rows: Row indices the hypothesis covers.
+        affected_columns: Column names the hypothesis covers.
+        root_cause_type: Detector-vocabulary root cause type.
+        confirmed: Whether the hypothesis was confirmed by ground truth.
+    """
+    claim: str
+    affected_rows: tuple[int, ...]
+    affected_columns: tuple[str, ...]
+    root_cause_type: str
+    confirmed: bool = False
+@dataclass(frozen=True)
+class ConfirmedIssue:
+    """A confirmed data-quality issue at a specific location.
+    Args:
+        row: Zero-indexed row number.
+        column: Column name.
+        issue_type: Issue type classification.
+    """
+    row: int
+    column: str
+    issue_type: str
+@dataclass(frozen=True)
+class DeadEnd:
+    """A recorded dead end — an investigation path that yielded nothing.
+    Args:
+        description: What was tried and why it failed.
+        step_number: Step at which the dead end was recorded.
+    """
+    description: str
+    step_number: int
+@dataclass
+class Scratchpad:
+    """Mutable in-episode tracker for hypotheses, confirmed issues, and dead ends.
+    Reset at the start of each episode. The ``summary()`` method produces a
+    compact string for inclusion in agent observations.
+    Example::
+        >>> pad = Scratchpad()
+        >>> pad.add_hypothesis("Decimal shift in rating", [5], ["rating"], "decimal_shift")
+        >>> len(pad.hypotheses)
+        1
+    """
+    hypotheses: list[HypothesisRecord] = field(default_factory=list)
+    confirmed_issues: list[ConfirmedIssue] = field(default_factory=list)
+    dead_ends: list[DeadEnd] = field(default_factory=list)
+    def add_hypothesis(
+        self,
+        claim: str,
+        affected_rows: list[int],
+        affected_columns: list[str],
+        root_cause_type: str,
+    ) -> HypothesisRecord:
+        """Record a new hypothesis.
+        Args:
+            claim: Textual description of the hypothesis.
+            affected_rows: Row indices the hypothesis covers.
+            affected_columns: Column names the hypothesis covers.
+            root_cause_type: Detector-vocabulary root cause type.
+        Returns:
+            The recorded hypothesis.
+        """
+        record = HypothesisRecord(
+            claim=claim,
+            affected_rows=tuple(affected_rows),
+            affected_columns=tuple(affected_columns),
+            root_cause_type=root_cause_type,
+        )
+        self.hypotheses.append(record)
+        return record
+    def confirm_hypothesis(self, index: int) -> None:
+        """Mark a hypothesis as confirmed.
+        Args:
+            index: Index into the ``hypotheses`` list.
+        Raises:
+            IndexError: If the index is out of range.
+        """
+        old = self.hypotheses[index]
+        self.hypotheses[index] = HypothesisRecord(
+            claim=old.claim,
+            affected_rows=old.affected_rows,
+            affected_columns=old.affected_columns,
+            root_cause_type=old.root_cause_type,
+            confirmed=True,
+        )
+    def confirm_issue(self, row: int, column: str, issue_type: str) -> None:
+        """Record a confirmed issue.
+        Args:
+            row: Zero-indexed row number.
+            column: Column name.
+            issue_type: Issue type classification.
+        """
+        self.confirmed_issues.append(ConfirmedIssue(row=row, column=column, issue_type=issue_type))
+    def add_dead_end(self, description: str, step_number: int) -> None:
+        """Record a dead end.
+        Args:
+            description: What was tried and why it failed.
+            step_number: Step at which the dead end was recorded.
+        """
+        self.dead_ends.append(DeadEnd(description=description, step_number=step_number))
+    def reset(self) -> None:
+        """Clear all tracked state for a new episode."""
+        self.hypotheses.clear()
+        self.confirmed_issues.clear()
+        self.dead_ends.clear()
+    def summary(self) -> str:
+        """Produce a compact summary string for observation embedding.
+        Returns:
+            A one-line summary of scratchpad state.
+        Example::
+            >>> Scratchpad().summary()
+            'Hypotheses: 0 (0 pending). Confirmed: 0. Dead ends: 0.'
+        """
+        pending = sum(1 for h in self.hypotheses if not h.confirmed)
+        return (
+            f"Hypotheses: {len(self.hypotheses)} ({pending} pending). "
+            f"Confirmed: {len(self.confirmed_issues)}. "
+            f"Dead ends: {len(self.dead_ends)}."
+        )

dataforge/agent/tool_actions.py ADDED Viewed

	@@ -0,0 +1,343 @@

+"""Typed tool-use action models for the DataForge RL environment.
+This module defines a discriminated union of 8 action types that an RL agent
+can submit to the DataForge environment. Each action is a standalone Pydantic
+model with its own validation rules, preventing cross-model field pollution.
+The ``parse_action`` function is the single entry point for HTTP handlers
+and tests to validate raw action dicts into typed models.
+Action Types:
+    INSPECT_ROWS  — View a slice of the dataset.
+    SQL_QUERY     — Execute read-only SQL against the episode DataFrame.
+    STAT_TEST     — Run a statistical test on a column.
+    PATTERN_MATCH — Evaluate a regex pattern against column values.
+    HYPOTHESIS    — Record a causal-root claim for credit.
+    ROOT_CAUSE    — Analyze selected detected errors for minimal roots.
+    DIAGNOSE      — Flag a suspected issue at (row, column).
+    FIX           — Propose a corrected value for a diagnosed issue.
+Example::
+    >>> from dataforge.agent.tool_actions import parse_action
+    >>> action = parse_action({"action_type": "INSPECT_ROWS", "row_indices": [0, 1]})
+    >>> action.action_type
+    'INSPECT_ROWS'
+"""
+from __future__ import annotations
+from typing import Annotated, Any, Literal
+from pydantic import BaseModel, Field, field_validator
+__all__ = [
+    "Action",
+    "Diagnose",
+    "Fix",
+    "Hypothesis",
+    "InspectRows",
+    "PatternMatch",
+    "RootCause",
+    "SqlQuery",
+    "StatTest",
+    "parse_action",
+]
+class InspectRows(BaseModel):
+    """View a slice of dataset rows.
+    Args:
+        action_type: Must be ``"INSPECT_ROWS"``.
+        row_indices: Zero-indexed row indices to retrieve. At least 1 required.
+        column_names: Optional column filter. If omitted, all columns returned.
+    Example::
+        >>> InspectRows(action_type="INSPECT_ROWS", row_indices=[0, 1, 2])
+    """
+    action_type: Literal["INSPECT_ROWS"]
+    row_indices: list[int] = Field(min_length=1, description="Row indices to inspect (0-indexed).")
+    column_names: list[str] | None = Field(default=None, description="Optional column filter.")
+    @field_validator("row_indices")
+    @classmethod
+    def _validate_row_indices(cls, v: list[int]) -> list[int]:
+        """Validate that all row indices are non-negative."""
+        if any(i < 0 for i in v):
+            raise ValueError("All row indices must be >= 0")
+        return v
+    model_config = {"frozen": True}
+class SqlQuery(BaseModel):
+    """Execute read-only SQL against the episode DataFrame via DuckDB.
+    Args:
+        action_type: Must be ``"SQL_QUERY"``.
+        query: SQL query string. Must be read-only (SELECT only).
+    Example::
+        >>> SqlQuery(action_type="SQL_QUERY", query="SELECT * FROM data LIMIT 5")
+    """
+    action_type: Literal["SQL_QUERY"]
+    query: str = Field(min_length=1, description="Read-only SQL query.")
+    model_config = {"frozen": True}
+class StatTest(BaseModel):
+    """Run a statistical test on a dataset column.
+    Args:
+        action_type: Must be ``"STAT_TEST"``.
+        test_type: One of ``"zscore"``, ``"iqr"``, ``"ks"``.
+        column: Column name to test.
+        threshold: Optional threshold override. Defaults vary by test type.
+    Example::
+        >>> StatTest(action_type="STAT_TEST", test_type="zscore", column="rating")
+    """
+    action_type: Literal["STAT_TEST"]
+    test_type: Literal["zscore", "iqr", "ks"] = Field(description="Statistical test to run.")
+    column: str = Field(min_length=1, description="Column name to test.")
+    threshold: float | None = Field(default=None, description="Optional threshold override.")
+    model_config = {"frozen": True}
+class PatternMatch(BaseModel):
+    """Evaluate a regex pattern against column values.
+    Args:
+        action_type: Must be ``"PATTERN_MATCH"``.
+        pattern: Regular expression string.
+        column: Column name to evaluate.
+        expect_match: If True, report rows that match. If False, report non-matches.
+    Example::
+        >>> PatternMatch(
+        ...     action_type="PATTERN_MATCH",
+        ...     pattern=r"^\\d{5}$",
+        ...     column="zip_code",
+        ... )
+    """
+    action_type: Literal["PATTERN_MATCH"]
+    pattern: str = Field(min_length=1, description="Regex pattern.")
+    column: str = Field(min_length=1, description="Column name to evaluate.")
+    expect_match: bool = Field(
+        default=True,
+        description="True to report matches, False to report non-matches.",
+    )
+    model_config = {"frozen": True}
+class Hypothesis(BaseModel):
+    """Record a causal-root claim for root-cause credit.
+    Args:
+        action_type: Must be ``"HYPOTHESIS"``.
+        claim: Textual description of the hypothesized root cause.
+        affected_rows: Row indices believed to be affected.
+        affected_columns: Column names believed to be affected.
+        root_cause_type: Detector-vocabulary root cause type
+            (e.g., ``"decimal_shift"``, ``"type_mismatch"``).
+    Example::
+        >>> Hypothesis(
+        ...     action_type="HYPOTHESIS",
+        ...     claim="Column 'rating' has a decimal shift at row 5",
+        ...     affected_rows=[5],
+        ...     affected_columns=["rating"],
+        ...     root_cause_type="decimal_shift",
+        ... )
+    """
+    action_type: Literal["HYPOTHESIS"]
+    claim: str = Field(min_length=1, description="Root-cause claim.")
+    affected_rows: list[int] = Field(min_length=1, description="Affected row indices.")
+    affected_columns: list[str] = Field(min_length=1, description="Affected column names.")
+    root_cause_type: str = Field(min_length=1, description="Detector-vocabulary root cause type.")
+    @field_validator("affected_rows")
+    @classmethod
+    def _validate_affected_rows(cls, v: list[int]) -> list[int]:
+        """Validate that all affected row indices are non-negative."""
+        if any(i < 0 for i in v):
+            raise ValueError("All affected row indices must be >= 0")
+        return v
+    model_config = {"frozen": True}
+class RootCause(BaseModel):
+    """Analyze selected detected errors for minimal causal roots.
+    Args:
+        action_type: Must be ``"ROOT_CAUSE"``.
+        error_indices: Zero-based indices into the episode's detected issue list.
+    Example::
+        >>> RootCause(action_type="ROOT_CAUSE", error_indices=[0, 1])
+    """
+    action_type: Literal["ROOT_CAUSE"]
+    error_indices: list[int] = Field(min_length=1, description="Detected issue indices.")
+    @field_validator("error_indices")
+    @classmethod
+    def _validate_error_indices(cls, v: list[int]) -> list[int]:
+        """Validate that all error indices are non-negative."""
+        if any(i < 0 for i in v):
+            raise ValueError("All error indices must be >= 0")
+        return v
+    model_config = {"frozen": True}
+class Diagnose(BaseModel):
+    """Flag a suspected data-quality issue at a specific (row, column).
+    Args:
+        action_type: Must be ``"DIAGNOSE"``.
+        row: Zero-indexed row number.
+        column: Column name.
+        issue_type: Issue type from detector vocabulary.
+    Example::
+        >>> Diagnose(
+        ...     action_type="DIAGNOSE",
+        ...     row=5, column="rating",
+        ...     issue_type="decimal_shift",
+        ... )
+    """
+    action_type: Literal["DIAGNOSE"]
+    row: int = Field(ge=0, description="Zero-indexed row number.")
+    column: str = Field(min_length=1, description="Column name.")
+    issue_type: str = Field(min_length=1, description="Issue type classification.")
+    model_config = {"frozen": True}
+class Fix(BaseModel):
+    """Propose a corrected value for a diagnosed issue.
+    Args:
+        action_type: Must be ``"FIX"``.
+        row: Zero-indexed row number; must be >= 0.
+        column: Column name; must be a non-empty string.
+        new_value: The corrected cell value as a string. No minimum length is
+            enforced, so an empty string is accepted.
+        justification: Explanation of why this fix is correct; must be non-empty.
+        fix_type: How to fix the issue. One of ``"correct_value"``,
+            ``"delete_row"``, ``"impute"`` or ``"standardize"``. Defaults to
+            ``"correct_value"``.
+    Example::
+        >>> Fix(
+        ...     action_type="FIX",
+        ...     row=5, column="rating",
+        ...     new_value="4.5",
+        ...     justification="Decimal shift: 45.0 should be 4.5",
+        ... )
+    """
+    action_type: Literal["FIX"]
+    row: int = Field(ge=0, description="Zero-indexed row number.")
+    column: str = Field(min_length=1, description="Column name.")
+    new_value: str = Field(description="Corrected cell value.")
+    justification: str = Field(min_length=1, description="Fix justification.")
+    fix_type: Literal["correct_value", "delete_row", "impute", "standardize"] = Field(
+        default="correct_value", description="Fix operation type."
+    )
+    # frozen=True asks pydantic to treat validated instances as immutable.
+    model_config = {"frozen": True}
+# ═══════════════════════════════════════════════════════════════════════════
+# Discriminated union and parser
+# ═══════════════════════════════════════════════════════════════════════════
+# Tagged union of every action model. ``action_type`` is declared as the
+# discriminator, so validation picks the member model from that field's value.
+Action = Annotated[
+    InspectRows | SqlQuery | StatTest | PatternMatch | Hypothesis | RootCause | Diagnose | Fix,
+    Field(discriminator="action_type"),
+]
+"""Discriminated union of all valid DataForge environment actions."""
+# Lazily built ``TypeAdapter`` for ``Action``. It is cached at module level so
+# the union validator is constructed once rather than on every parse call.
+_ACTION_ADAPTER: Any = None
+def parse_action(raw: dict[str, Any]) -> Action:
+    """Parse and validate a raw action dict into the appropriate typed model.
+    This is the single entry point for HTTP handlers and tests to validate
+    actions. Supported external aliases are first mapped to canonical field
+    names, then the ``action_type`` field is used as the discriminator.
+    Args:
+        raw: Dictionary with an ``action_type`` key and action-specific fields.
+            It is not mutated; normalization works on a copy.
+    Returns:
+        A validated action model instance.
+    Raises:
+        pydantic.ValidationError: If the action is malformed or invalid. This
+            includes a missing or unrecognized ``action_type``: the
+            discriminated union reports both as validation errors, not as
+            ``KeyError``. ``ValidationError`` subclasses ``ValueError`` in
+            pydantic v2, so ``except ValueError`` also catches it.
+    Example::
+        >>> action = parse_action({"action_type": "INSPECT_ROWS", "row_indices": [0]})
+        >>> isinstance(action, InspectRows)
+        True
+    """
+    global _ACTION_ADAPTER
+    if _ACTION_ADAPTER is None:
+        # Imported here, as before, so the module does not need TypeAdapter
+        # at import time.
+        from pydantic import TypeAdapter
+        _ACTION_ADAPTER = TypeAdapter(Action)
+    return _ACTION_ADAPTER.validate_python(_normalize_action(raw))
+def _normalize_action(raw: dict[str, Any]) -> dict[str, Any]:
+    """Map supported external field aliases onto canonical action fields.
+    Works on a shallow copy of ``raw``. Alias keys are left in place and a
+    canonical key that is already present is never overwritten.
+    Args:
+        raw: Action dictionary as received from an external caller.
+    Returns:
+        A new dictionary with canonical keys filled in where an alias or a
+        default applies for the given ``action_type``.
+    """
+    normalized = dict(raw)
+    def _alias(source: str, target: str) -> None:
+        """Copy ``source`` to ``target`` unless ``target`` is already set."""
+        if source in normalized and target not in normalized:
+            normalized[target] = normalized[source]
+    kind = normalized.get("action_type")
+    if kind == "SQL_QUERY":
+        _alias("sql", "query")
+    elif kind == "STAT_TEST":
+        _alias("test", "test_type")
+    elif kind == "PATTERN_MATCH":
+        _alias("regex", "pattern")
+        if "expect" in normalized and "expect_match" not in normalized:
+            # Only the literal "match" maps to True; anything else is False.
+            normalized["expect_match"] = normalized["expect"] == "match"
+    elif kind == "HYPOTHESIS":
+        root = normalized.get("root_column")
+        if root is not None and "affected_columns" not in normalized:
+            downstream = normalized.get("downstream")
+            # A non-list ``downstream`` is ignored rather than coerced.
+            extra = downstream if isinstance(downstream, list) else []
+            normalized["affected_columns"] = [root, *extra]
+        normalized.setdefault("affected_rows", [0])
+        if root is not None:
+            normalized.setdefault("root_cause_type", root)
+    elif kind == "ROOT_CAUSE":
+        _alias("indices", "error_indices")
+    elif kind == "FIX":
+        _alias("proposed_value", "new_value")
+        normalized.setdefault("justification", "Agent proposed value via FIX.")
+    return normalized

dataforge/bench/core.py CHANGED Viewed

@@ -59,6 +59,7 @@ class SeedBenchmarkResult(BaseModel):
     prompt_tokens: int = Field(ge=0, default=0)
     completion_tokens: int = Field(ge=0, default=0)
     quota_units: float = Field(ge=0.0, default=0.0)
     runtime_s: float = Field(ge=0.0, default=0.0)
     provider: str | None = None
     model: str | None = None
@@ -85,6 +86,8 @@ class AggregateBenchmarkResult(BaseModel):
     avg_steps_std: float | None = None
     quota_units_mean: float | None = None
     quota_units_std: float | None = None
     runtime_s_mean: float | None = None
     runtime_s_std: float | None = None
     provider: str | None = None
@@ -229,6 +232,7 @@ def aggregate_seed_results(
         f1_mean, f1_std = _mean_std([row.f1 or 0.0 for row in ok_rows])
         avg_steps_mean, avg_steps_std = _mean_std([row.avg_steps or 0.0 for row in ok_rows])
         quota_mean, quota_std = _mean_std([row.quota_units for row in ok_rows])
         runtime_mean, runtime_std = _mean_std([row.runtime_s for row in ok_rows])
         aggregates.append(
             AggregateBenchmarkResult(
@@ -248,6 +252,8 @@ def aggregate_seed_results(
                 avg_steps_std=avg_steps_std,
                 quota_units_mean=quota_mean,
                 quota_units_std=quota_std,
                 runtime_s_mean=runtime_mean,
                 runtime_s_std=runtime_std,
                 provider=ok_rows[0].provider,

     prompt_tokens: int = Field(ge=0, default=0)
     completion_tokens: int = Field(ge=0, default=0)
     quota_units: float = Field(ge=0.0, default=0.0)
+    gpu_hours: float = Field(ge=0.0, default=0.0)
     runtime_s: float = Field(ge=0.0, default=0.0)
     provider: str | None = None
     model: str | None = None
     avg_steps_std: float | None = None
     quota_units_mean: float | None = None
     quota_units_std: float | None = None
+    gpu_hours_mean: float | None = None
+    gpu_hours_std: float | None = None
     runtime_s_mean: float | None = None
     runtime_s_std: float | None = None
     provider: str | None = None
         f1_mean, f1_std = _mean_std([row.f1 or 0.0 for row in ok_rows])
         avg_steps_mean, avg_steps_std = _mean_std([row.avg_steps or 0.0 for row in ok_rows])
         quota_mean, quota_std = _mean_std([row.quota_units for row in ok_rows])
+        gpu_hours_mean, gpu_hours_std = _mean_std([row.gpu_hours for row in ok_rows])
         runtime_mean, runtime_std = _mean_std([row.runtime_s for row in ok_rows])
         aggregates.append(
             AggregateBenchmarkResult(
                 avg_steps_std=avg_steps_std,
                 quota_units_mean=quota_mean,
                 quota_units_std=quota_std,
+                gpu_hours_mean=gpu_hours_mean,
+                gpu_hours_std=gpu_hours_std,
                 runtime_s_mean=runtime_mean,
                 runtime_s_std=runtime_std,
                 provider=ok_rows[0].provider,

dataforge/bench/groq_client.py CHANGED Viewed

@@ -1,21 +1,45 @@
-"""Minimal Groq client for benchmark-only LLM baselines."""
 from __future__ import annotations
 import json
 import time
 from dataclasses import dataclass
 from typing import cast
 import httpx
-from tenacity import retry, retry_if_exception, stop_after_attempt, wait_fixed
 def _is_rate_limit_error(exc: BaseException) -> bool:
-    """Return whether an exception is a Groq 429 response."""
     return isinstance(exc, httpx.HTTPStatusError) and exc.response.status_code == 429
 @dataclass(frozen=True, kw_only=True)
 class GroqCompletion:
     """Completion payload plus conservative usage accounting."""
@@ -26,26 +50,50 @@ class GroqCompletion:
     warnings: tuple[str, ...]
-class GroqBenchClient:
-    """Sequential Groq client with fixed 429 retry and spacing."""
     def __init__(
         self,
         *,
         api_key: str,
-        model: str = "llama-3.3-70b-versatile",
         min_interval_s: float = 2.0,
     ) -> None:
         self._api_key = api_key
         self._model = model
         self._min_interval_s = min_interval_s
         self._last_success_at: float | None = None
     @property
     def model(self) -> str:
-        """Return the configured Groq model name."""
         return self._model
     def _respect_spacing(self) -> None:
         """Sleep long enough to keep requests sequential with a fixed gap."""
         if self._last_success_at is None:
@@ -55,33 +103,57 @@ class GroqBenchClient:
         if remaining > 0:
             time.sleep(remaining)
-    @retry(
-        retry=retry_if_exception(_is_rate_limit_error),
-        wait=wait_fixed(2),
-        stop=stop_after_attempt(3),
-        reraise=True,
-    )
     def _post(self, messages: list[dict[str, str]]) -> dict[str, object]:
-        """Issue the underlying Groq chat-completions request."""
         payload = {
             "model": self._model,
             "messages": messages,
             "temperature": 0.0,
         }
-        with httpx.Client(timeout=60.0) as client:
-            response = client.post(
-                "https://api.groq.com/openai/v1/chat/completions",
-                json=payload,
-                headers={
-                    "Authorization": f"Bearer {self._api_key}",
-                    "Content-Type": "application/json",
-                },
-            )
-            response.raise_for_status()
-        return dict(response.json())
     def complete(self, messages: list[dict[str, str]]) -> GroqCompletion:
-        """Send one benchmark completion request to Groq."""
         self._respect_spacing()
         payload = self._post(messages)
         self._last_success_at = time.monotonic()
@@ -92,16 +164,223 @@ class GroqBenchClient:
         completion_tokens = int(usage.get("completion_tokens", 0)) if isinstance(usage, dict) else 0
         if not usage:
             warnings.append("missing_usage_payload")
         try:
             choices = cast(list[dict[str, object]], payload["choices"])
             message = cast(dict[str, object], choices[0]["message"])
             content = str(message["content"])
         except (KeyError, IndexError, TypeError) as exc:
-            raise ValueError(f"Unexpected Groq response payload: {json.dumps(payload)}") from exc
         return GroqCompletion(
             text=content,
             prompt_tokens=prompt_tokens,
             completion_tokens=completion_tokens,
             warnings=tuple(warnings),
         )

+"""Minimal OpenAI-compatible clients for benchmark-only LLM baselines."""
 from __future__ import annotations
 import json
+import logging
 import time
 from dataclasses import dataclass
 from typing import cast
 import httpx
+class ProviderRequestError(RuntimeError):
+    """Raised when a provider request fails with an HTTP error that is not retried.
+    The clients in this module raise it for a non-retryable status code and
+    for a retryable one (429/503) received on the final allowed attempt.
+    """
+class ProviderRateLimitError(ProviderRequestError):
+    """Raised when a provider asks us to wait longer than the configured cap.
+    Subclasses ``ProviderRequestError``, so handlers for the base class also
+    catch it.
+    """
 def _is_rate_limit_error(exc: BaseException) -> bool:
+    """Return whether an exception is an HTTP 429 response."""
     return isinstance(exc, httpx.HTTPStatusError) and exc.response.status_code == 429
+def _is_retryable_provider_error(exc: BaseException) -> bool:
+    """Tell whether ``exc`` is an HTTP status error with code 429 or 503."""
+    if not isinstance(exc, httpx.HTTPStatusError):
+        return False
+    return exc.response.status_code in {429, 503}
+def _retry_after_s(exc: httpx.HTTPStatusError, *, fallback_s: float) -> float:
+    """Return how long to wait before retrying, honouring ``Retry-After``.
+    Args:
+        exc: The HTTP status error whose response headers are inspected.
+        fallback_s: Delay used when the header is absent or unusable. It is
+            also the lower bound for any advertised delay.
+    Returns:
+        ``max(advertised, fallback_s)`` when the ``retry-after`` header holds
+        a finite number of seconds, otherwise ``fallback_s``.
+    """
+    import math
+    raw_retry_after = exc.response.headers.get("retry-after")
+    if raw_retry_after is None:
+        return fallback_s
+    try:
+        advertised_s = float(raw_retry_after)
+    except ValueError:
+        # Not a plain number (e.g. the HTTP-date form); it is not parsed here.
+        return fallback_s
+    if not math.isfinite(advertised_s):
+        # "nan" and "inf" parse as floats. max() would hand NaN back to the
+        # caller and time.sleep(nan) raises, so treat them as malformed.
+        return fallback_s
+    return max(advertised_s, fallback_s)
 @dataclass(frozen=True, kw_only=True)
 class GroqCompletion:
     """Completion payload plus conservative usage accounting."""
     warnings: tuple[str, ...]
+class OpenAICompatBenchClient:
+    """Sequential OpenAI-compatible client with fixed 429 retry and spacing."""
     def __init__(
         self,
         *,
         api_key: str,
+        model: str,
+        endpoint: str,
+        provider: str,
         min_interval_s: float = 2.0,
+        max_tokens: int = 512,
+        max_retries: int = 5,
+        max_retry_after_s: float = 120.0,
+        timeout_s: float = 60.0,
     ) -> None:
         self._api_key = api_key
         self._model = model
+        self._endpoint = endpoint
+        self._provider = provider
         self._min_interval_s = min_interval_s
+        self._max_tokens = max_tokens
+        self._max_retries = max_retries
+        self._max_retry_after_s = max_retry_after_s
+        self._timeout_s = timeout_s
         self._last_success_at: float | None = None
+        self._client = httpx.Client(
+            timeout=self._timeout_s,
+            headers={
+                "Authorization": f"Bearer {self._api_key}",
+                "Content-Type": "application/json",
+            },
+        )
     @property
     def model(self) -> str:
+        """Return the configured provider model name."""
         return self._model
+    @property
+    def provider(self) -> str:
+        """Return the configured provider identifier."""
+        return self._provider
     def _respect_spacing(self) -> None:
         """Sleep long enough to keep requests sequential with a fixed gap."""
         if self._last_success_at is None:
         if remaining > 0:
             time.sleep(remaining)
     def _post(self, messages: list[dict[str, str]]) -> dict[str, object]:
+        """Issue the underlying chat-completions request."""
         payload = {
             "model": self._model,
             "messages": messages,
             "temperature": 0.0,
+            "max_tokens": self._max_tokens,
         }
+        last_rate_limit_error: httpx.HTTPStatusError | None = None
+        for attempt in range(self._max_retries):
+            response: httpx.Response | None = None
+            try:
+                response = self._client.post(
+                    self._endpoint,
+                    json=payload,
+                )
+                response.raise_for_status()
+            except httpx.HTTPStatusError as exc:
+                if not _is_retryable_provider_error(exc) or attempt == self._max_retries - 1:
+                    body = exc.response.text[:500].replace("\n", " ")
+                    raise ProviderRequestError(
+                        f"{self._provider} request rejected with HTTP "
+                        f"{exc.response.status_code}: {body}"
+                    ) from exc
+                last_rate_limit_error = exc
+                retry_s = _retry_after_s(exc, fallback_s=2.0 * (attempt + 1))
+                if retry_s > self._max_retry_after_s:
+                    body = exc.response.text[:500].replace("\n", " ")
+                    raise ProviderRateLimitError(
+                        f"{self._provider} rate limit retry-after {retry_s:.2f}s "
+                        f"exceeds cap {self._max_retry_after_s:.2f}s: {body}"
+                    ) from exc
+                logging.getLogger("dataforge.bench.groq_client").warning(
+                    "%s_rate_limit attempt=%d retry_after_s=%.2f",
+                    self._provider,
+                    attempt + 1,
+                    retry_s,
+                )
+                time.sleep(retry_s)
+                continue
+            except httpx.TimeoutException as exc:
+                raise TimeoutError(
+                    f"{self._provider} request timed out after {self._timeout_s:.1f} seconds."
+                ) from exc
+            return dict(response.json())
+        if last_rate_limit_error is not None:
+            raise last_rate_limit_error
+        raise RuntimeError(f"{self._provider} request failed without a response.")
     def complete(self, messages: list[dict[str, str]]) -> GroqCompletion:
+        """Send one benchmark completion request to the configured provider."""
         self._respect_spacing()
         payload = self._post(messages)
         self._last_success_at = time.monotonic()
         completion_tokens = int(usage.get("completion_tokens", 0)) if isinstance(usage, dict) else 0
         if not usage:
             warnings.append("missing_usage_payload")
+            logging.getLogger("dataforge.bench.groq_client").warning(
+                "%s_missing_usage_payload", self._provider
+            )
         try:
             choices = cast(list[dict[str, object]], payload["choices"])
             message = cast(dict[str, object], choices[0]["message"])
             content = str(message["content"])
         except (KeyError, IndexError, TypeError) as exc:
+            raise ValueError(
+                f"Unexpected {self._provider} response payload: {json.dumps(payload)}"
+            ) from exc
         return GroqCompletion(
             text=content,
             prompt_tokens=prompt_tokens,
             completion_tokens=completion_tokens,
             warnings=tuple(warnings),
         )
+class GroqBenchClient(OpenAICompatBenchClient):
+    """Groq preset of the OpenAI-compatible benchmark client.
+    Only fixes the endpoint and the provider label (``"groq"``); request
+    spacing and retry handling are inherited from ``OpenAICompatBenchClient``.
+    Args:
+        api_key: Groq API key, forwarded to the base client.
+        model: Model name sent with each request.
+        min_interval_s: Spacing setting forwarded to the base client.
+        max_tokens: Value sent as ``max_tokens`` in each request payload.
+        max_retries: Maximum number of request attempts.
+        max_retry_after_s: Longest retry delay tolerated before giving up.
+        timeout_s: HTTP client timeout in seconds.
+    """
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        model: str = "llama-3.3-70b-versatile",
+        min_interval_s: float = 2.0,
+        max_tokens: int = 512,
+        max_retries: int = 5,
+        max_retry_after_s: float = 120.0,
+        timeout_s: float = 60.0,
+    ) -> None:
+        super().__init__(
+            api_key=api_key,
+            model=model,
+            endpoint="https://api.groq.com/openai/v1/chat/completions",
+            provider="groq",
+            min_interval_s=min_interval_s,
+            max_tokens=max_tokens,
+            max_retries=max_retries,
+            max_retry_after_s=max_retry_after_s,
+            timeout_s=timeout_s,
+        )
+class CerebrasBenchClient(OpenAICompatBenchClient):
+    """Cerebras preset of the OpenAI-compatible benchmark client.
+    Only fixes the endpoint and the provider label (``"cerebras"``); request
+    spacing and retry handling are inherited from ``OpenAICompatBenchClient``.
+    The default ``min_interval_s`` (0.5) is shorter than the Groq preset's.
+    Args:
+        api_key: Cerebras API key, forwarded to the base client.
+        model: Model name sent with each request.
+        min_interval_s: Spacing setting forwarded to the base client.
+        max_tokens: Value sent as ``max_tokens`` in each request payload.
+        max_retries: Maximum number of request attempts.
+        max_retry_after_s: Longest retry delay tolerated before giving up.
+        timeout_s: HTTP client timeout in seconds.
+    """
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        model: str = "qwen-3-235b-a22b-instruct-2507",
+        min_interval_s: float = 0.5,
+        max_tokens: int = 512,
+        max_retries: int = 5,
+        max_retry_after_s: float = 120.0,
+        timeout_s: float = 60.0,
+    ) -> None:
+        super().__init__(
+            api_key=api_key,
+            model=model,
+            endpoint="https://api.cerebras.ai/v1/chat/completions",
+            provider="cerebras",
+            min_interval_s=min_interval_s,
+            max_tokens=max_tokens,
+            max_retries=max_retries,
+            max_retry_after_s=max_retry_after_s,
+            timeout_s=timeout_s,
+        )
+class GeminiBenchClient:
+    """Sequential Gemini client adapted to the benchmark completion interface.
+    Converts OpenAI-style chat messages into a ``generateContent`` request,
+    keeps at least ``min_interval_s`` between a successful call and the next
+    one, and retries HTTP 429/503 responses.
+    Args:
+        api_key: Gemini API key, sent in the ``x-goog-api-key`` request header.
+        model: Model name; a leading ``"models/"`` prefix is stripped.
+        min_interval_s: Minimum gap in seconds after the last successful call.
+        max_tokens: Value sent as ``generationConfig.maxOutputTokens``.
+        max_retries: Maximum number of request attempts.
+        max_retry_after_s: Longest retry delay tolerated before giving up.
+        timeout_s: HTTP client timeout in seconds.
+    """
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        model: str = "gemini-3.1-pro-preview",
+        min_interval_s: float = 2.0,
+        max_tokens: int = 512,
+        max_retries: int = 5,
+        max_retry_after_s: float = 120.0,
+        timeout_s: float = 60.0,
+    ) -> None:
+        self._api_key = api_key
+        self._model = model.removeprefix("models/")
+        self._min_interval_s = min_interval_s
+        self._max_tokens = max_tokens
+        self._max_retries = max_retries
+        self._max_retry_after_s = max_retry_after_s
+        self._timeout_s = timeout_s
+        self._last_success_at: float | None = None
+        self._client = httpx.Client(
+            timeout=self._timeout_s,
+            headers={"Content-Type": "application/json"},
+        )
+    @property
+    def model(self) -> str:
+        """Return the configured Gemini model name."""
+        return self._model
+    @property
+    def provider(self) -> str:
+        """Return the provider identifier."""
+        return "gemini"
+    def _respect_spacing(self) -> None:
+        """Sleep long enough to keep requests sequential with a fixed gap."""
+        if self._last_success_at is None:
+            return
+        elapsed = time.monotonic() - self._last_success_at
+        remaining = self._min_interval_s - elapsed
+        if remaining > 0:
+            time.sleep(remaining)
+    def _payload(self, messages: list[dict[str, str]]) -> dict[str, object]:
+        """Convert OpenAI-style chat messages to Gemini generateContent payload.
+        ``system`` messages are joined with blank lines into
+        ``systemInstruction``; ``assistant`` maps to the ``model`` role and
+        every other role to ``user``.
+        """
+        system_texts: list[str] = []
+        contents: list[dict[str, object]] = []
+        for message in messages:
+            role = message.get("role", "user")
+            content = message.get("content", "")
+            if role == "system":
+                system_texts.append(content)
+                continue
+            gemini_role = "model" if role == "assistant" else "user"
+            contents.append({"role": gemini_role, "parts": [{"text": content}]})
+        payload: dict[str, object] = {
+            "contents": contents,
+            "generationConfig": {
+                "temperature": 0.0,
+                "maxOutputTokens": self._max_tokens,
+            },
+        }
+        if system_texts:
+            payload["systemInstruction"] = {
+                "parts": [{"text": "\n\n".join(system_texts)}],
+            }
+        return payload
+    def _post(self, messages: list[dict[str, str]]) -> dict[str, object]:
+        """Issue the underlying Gemini generateContent request.
+        Raises:
+            ProviderRequestError: On a non-retryable HTTP status, or a
+                retryable one (429/503) on the final attempt.
+            ProviderRateLimitError: If the required retry delay exceeds
+                ``max_retry_after_s``.
+            TimeoutError: If the HTTP request times out.
+            RuntimeError: If no attempt was made (``max_retries`` < 1).
+        """
+        endpoint = (
+            f"https://generativelanguage.googleapis.com/v1beta/models/{self._model}:generateContent"
+        )
+        # The key travels in a header, not in a ``?key=`` query parameter:
+        # httpx includes the request URL in HTTPStatusError messages and in
+        # its request logging, which would expose the secret in tracebacks
+        # and logs.
+        auth_headers = {"x-goog-api-key": self._api_key}
+        # The request body does not change between attempts; build it once.
+        body = self._payload(messages)
+        last_rate_limit_error: httpx.HTTPStatusError | None = None
+        for attempt in range(self._max_retries):
+            response: httpx.Response | None = None
+            try:
+                response = self._client.post(
+                    endpoint,
+                    headers=auth_headers,
+                    json=body,
+                )
+                response.raise_for_status()
+            except httpx.HTTPStatusError as exc:
+                if not _is_retryable_provider_error(exc) or attempt == self._max_retries - 1:
+                    body_text = exc.response.text[:500].replace("\n", " ")
+                    raise ProviderRequestError(
+                        f"gemini request rejected with HTTP {exc.response.status_code}: {body_text}"
+                    ) from exc
+                last_rate_limit_error = exc
+                # Without a usable Retry-After header the wait grows linearly.
+                retry_s = _retry_after_s(exc, fallback_s=2.0 * (attempt + 1))
+                if retry_s > self._max_retry_after_s:
+                    body_text = exc.response.text[:500].replace("\n", " ")
+                    raise ProviderRateLimitError(
+                        f"gemini rate limit retry-after {retry_s:.2f}s "
+                        f"exceeds cap {self._max_retry_after_s:.2f}s: {body_text}"
+                    ) from exc
+                logging.getLogger("dataforge.bench.groq_client").warning(
+                    "gemini_rate_limit attempt=%d retry_after_s=%.2f",
+                    attempt + 1,
+                    retry_s,
+                )
+                time.sleep(retry_s)
+                continue
+            except httpx.TimeoutException as exc:
+                raise TimeoutError(
+                    f"gemini request timed out after {self._timeout_s:.1f} seconds."
+                ) from exc
+            return dict(response.json())
+        if last_rate_limit_error is not None:
+            raise last_rate_limit_error
+        raise RuntimeError("gemini request failed without a response.")
+    def complete(self, messages: list[dict[str, str]]) -> GroqCompletion:
+        """Send one benchmark completion request to Gemini.
+        Returns:
+            The concatenated text parts of the first candidate plus token
+            counts read from ``usageMetadata`` (0 when absent).
+        Raises:
+            ValueError: If the response lacks the expected
+                ``candidates[0].content.parts`` structure.
+        """
+        self._respect_spacing()
+        payload = self._post(messages)
+        self._last_success_at = time.monotonic()
+        warnings: list[str] = []
+        usage = payload.get("usageMetadata", {})
+        prompt_tokens = int(usage.get("promptTokenCount", 0)) if isinstance(usage, dict) else 0
+        completion_tokens = (
+            int(usage.get("candidatesTokenCount", 0)) if isinstance(usage, dict) else 0
+        )
+        if not usage:
+            warnings.append("missing_usage_payload")
+            logging.getLogger("dataforge.bench.groq_client").warning("gemini_missing_usage_payload")
+        try:
+            candidates = cast(list[dict[str, object]], payload["candidates"])
+            content = cast(dict[str, object], candidates[0]["content"])
+            parts = cast(list[dict[str, object]], content["parts"])
+            text = "".join(str(part.get("text", "")) for part in parts)
+        except (KeyError, IndexError, TypeError, AttributeError) as exc:
+            # AttributeError covers a part that is not a mapping (no ``.get``).
+            raise ValueError(f"Unexpected gemini response payload: {json.dumps(payload)}") from exc
+        return GroqCompletion(
+            text=text,
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            warnings=tuple(warnings),
+        )

dataforge/bench/methods.py CHANGED Viewed

@@ -151,6 +151,40 @@ def _column_stats(
     return stats
 def _extract_json_object(text: str) -> dict[str, object] | None:
     """Parse the first JSON object found in an LLM response string."""
     stripped = text.strip()
@@ -158,6 +192,7 @@ def _extract_json_object(text: str) -> dict[str, object] | None:
         stripped = stripped.strip("`")
         if stripped.lower().startswith("json"):
             stripped = stripped[4:].strip()
     decoder = json.JSONDecoder()
     for offset, char in enumerate(stripped):
         if char != "{":

     return stats
+def _strip_json_line_comments(text: str) -> str:
+    """Drop ``//`` line comments that appear outside JSON string literals.
+    Scans the text once, tracking whether the cursor is inside a
+    double-quoted string. A comment runs from ``//`` up to, but not
+    including, the next ``\\r`` or ``\\n``; ``//`` inside a string is kept.
+    """
+    kept: list[str] = []
+    size = len(text)
+    pos = 0
+    inside_string = False
+    while pos < size:
+        current = text[pos]
+        if inside_string:
+            kept.append(current)
+            pos += 1
+            if current == "\\":
+                # A backslash escapes the next character: copy it verbatim so
+                # an escaped quote does not close the string.
+                if pos < size:
+                    kept.append(text[pos])
+                    pos += 1
+            elif current == '"':
+                inside_string = False
+        elif current == '"':
+            inside_string = True
+            kept.append(current)
+            pos += 1
+        elif text.startswith("//", pos):
+            # Skip the comment body but leave the line terminator in place.
+            pos += 2
+            while pos < size and text[pos] not in "\r\n":
+                pos += 1
+        else:
+            kept.append(current)
+            pos += 1
+    return "".join(kept)
 def _extract_json_object(text: str) -> dict[str, object] | None:
     """Parse the first JSON object found in an LLM response string."""
     stripped = text.strip()
         stripped = stripped.strip("`")
         if stripped.lower().startswith("json"):
             stripped = stripped[4:].strip()
+    stripped = _strip_json_line_comments(stripped)
     decoder = json.JSONDecoder()
     for offset, char in enumerate(stripped):
         if char != "{":

dataforge/bench/report.py CHANGED Viewed

@@ -69,13 +69,14 @@ def _aggregate_across_datasets(aggregates: list[AggregateBenchmarkResult]) -> li
     for method in methods:
         ok_rows = grouped.get(method, [])
         if not ok_rows:
-            rows.append([method, "Skipped", "Skipped", "Skipped", "Skipped", "Skipped"])
             continue
         p_mean = sum(row.precision_mean or 0.0 for row in ok_rows) / len(ok_rows)
         r_mean = sum(row.recall_mean or 0.0 for row in ok_rows) / len(ok_rows)
         f_mean = sum(row.f1_mean or 0.0 for row in ok_rows) / len(ok_rows)
         step_mean = sum(row.avg_steps_mean or 0.0 for row in ok_rows) / len(ok_rows)
         quota_mean = sum(row.quota_units_mean or 0.0 for row in ok_rows) / len(ok_rows)
         rows.append(
             [
                 method,
@@ -84,6 +85,7 @@ def _aggregate_across_datasets(aggregates: list[AggregateBenchmarkResult]) -> li
                 f"{f_mean:.4f}",
                 f"{step_mean:.2f}",
                 f"{quota_mean:.4f}",
             ]
         )
     return rows
@@ -104,15 +106,13 @@ def build_readme_benchmark_block(agent_output: BenchmarkRunOutput, report_path:
     """Build the generated README benchmark summary block."""
     rows = _aggregate_across_datasets(agent_output.aggregates)
     table = _render_table(
-        ["Method", "Precision", "Recall", "F1", "Avg Steps", "Quota Units"],
         rows,
     )
     skip_reasons = _collect_skip_reasons(agent_output.aggregates)
     skip_note = ""
     if skip_reasons:
-        skip_note = (
-            "\n\nSkipped methods in this run: " + "; ".join(skip_reasons)
-        )
     return (
         "Generated from `eval/results/agent_comparison.json`.\n\n"
         f"{table}\n\n"
@@ -140,19 +140,28 @@ def render_benchmark_report(
                 _format_metric(row.f1_mean, row.f1_std),
                 _format_metric(row.avg_steps_mean, row.avg_steps_std),
                 _format_metric(row.quota_units_mean, row.quota_units_std),
             ]
             for row in rows
         ]
         per_dataset_sections.append(
             f"### {dataset.title()}\n\n"
             + _render_table(
-                ["Method", "Precision", "Recall", "F1", "Avg Steps", "Quota Units"],
                 table_rows,
             )
         )
     local_summary = _render_table(
-        ["Method", "Precision", "Recall", "F1", "Avg Steps", "Quota Units"],
         _aggregate_across_datasets(agent_output.aggregates),
     )
@@ -179,11 +188,7 @@ def render_benchmark_report(
     skip_reasons = _collect_skip_reasons(agent_output.aggregates)
     skip_note = ""
     if skip_reasons:
-        skip_note = (
-            "\nSkipped methods in this reproduced run: "
-            + "; ".join(skip_reasons)
-            + "\n"
-        )
     method_values = agent_output.metadata.get("methods", [])
     dataset_values = agent_output.metadata.get("datasets", [])
@@ -203,6 +208,7 @@ def render_benchmark_report(
         f"- Datasets: {', '.join(datasets)}\n"
         f"- Seeds: {seeds}\n"
         "- Free-tier quota units: `max(llm_calls / 1000, (prompt_tokens + completion_tokens) / 100000)`\n"
         f"{skip_note}\n"
         "## Cross-Dataset Local Results\n\n"
         f"{local_summary}\n\n"
@@ -216,7 +222,7 @@ def render_benchmark_report(
             sota_rows,
         )
         + "\n\n## Methodology\n\n"
-        + "Local rows are reproduced from generated JSON. Citation-only SOTA rows are copied from literature and are not rerun in this repository. Quota units are reported in free-tier fractions rather than dollars.\n"
     )

     for method in methods:
         ok_rows = grouped.get(method, [])
         if not ok_rows:
+            rows.append([method, "Skipped", "Skipped", "Skipped", "Skipped", "Skipped", "Skipped"])
             continue
         p_mean = sum(row.precision_mean or 0.0 for row in ok_rows) / len(ok_rows)
         r_mean = sum(row.recall_mean or 0.0 for row in ok_rows) / len(ok_rows)
         f_mean = sum(row.f1_mean or 0.0 for row in ok_rows) / len(ok_rows)
         step_mean = sum(row.avg_steps_mean or 0.0 for row in ok_rows) / len(ok_rows)
         quota_mean = sum(row.quota_units_mean or 0.0 for row in ok_rows) / len(ok_rows)
+        gpu_hours_mean = sum(row.gpu_hours_mean or 0.0 for row in ok_rows) / len(ok_rows)
         rows.append(
             [
                 method,
                 f"{f_mean:.4f}",
                 f"{step_mean:.2f}",
                 f"{quota_mean:.4f}",
+                f"{gpu_hours_mean:.4f}",
             ]
         )
     return rows
     """Build the generated README benchmark summary block."""
     rows = _aggregate_across_datasets(agent_output.aggregates)
     table = _render_table(
+        ["Method", "Precision", "Recall", "F1", "Avg Steps", "Quota Units", "GPU Hours"],
         rows,
     )
     skip_reasons = _collect_skip_reasons(agent_output.aggregates)
     skip_note = ""
     if skip_reasons:
+        skip_note = "\n\nSkipped methods in this run: " + "; ".join(skip_reasons)
     return (
         "Generated from `eval/results/agent_comparison.json`.\n\n"
         f"{table}\n\n"
                 _format_metric(row.f1_mean, row.f1_std),
                 _format_metric(row.avg_steps_mean, row.avg_steps_std),
                 _format_metric(row.quota_units_mean, row.quota_units_std),
+                _format_metric(row.gpu_hours_mean, row.gpu_hours_std),
             ]
             for row in rows
         ]
         per_dataset_sections.append(
             f"### {dataset.title()}\n\n"
             + _render_table(
+                [
+                    "Method",
+                    "Precision",
+                    "Recall",
+                    "F1",
+                    "Avg Steps",
+                    "Quota Units",
+                    "GPU Hours",
+                ],
                 table_rows,
             )
         )
     local_summary = _render_table(
+        ["Method", "Precision", "Recall", "F1", "Avg Steps", "Quota Units", "GPU Hours"],
         _aggregate_across_datasets(agent_output.aggregates),
     )
     skip_reasons = _collect_skip_reasons(agent_output.aggregates)
     skip_note = ""
     if skip_reasons:
+        skip_note = "\nSkipped methods in this reproduced run: " + "; ".join(skip_reasons) + "\n"
     method_values = agent_output.metadata.get("methods", [])
     dataset_values = agent_output.metadata.get("datasets", [])
         f"- Datasets: {', '.join(datasets)}\n"
         f"- Seeds: {seeds}\n"
         "- Free-tier quota units: `max(llm_calls / 1000, (prompt_tokens + completion_tokens) / 100000)`\n"
+        "- GRPO compute cost is reported as free-tier GPU-hours, not dollars.\n"
         f"{skip_note}\n"
         "## Cross-Dataset Local Results\n\n"
         f"{local_summary}\n\n"
             sota_rows,
         )
         + "\n\n## Methodology\n\n"
+        + "Local rows are reproduced from generated JSON. Citation-only SOTA rows are copied from literature and are not rerun in this repository. LLM quota units are free-tier fractions; GRPO compute cost is GPU-hours, not dollars.\n"
     )

dataforge/bench/runner.py CHANGED Viewed

@@ -3,6 +3,7 @@
 from __future__ import annotations
 import os
 from pathlib import Path
 from dotenv import load_dotenv
@@ -96,18 +97,21 @@ def run_agent_comparison(
     output_json: Path,
     really_run_big_bench: bool,
     cache_root: Path | None = None,
 ) -> BenchmarkRunOutput:
     """Run the selected benchmark methods across real-world datasets."""
     load_dotenv()
     _validate_inputs(methods, datasets, seeds)
     estimated_calls = estimate_llm_calls(methods=methods, datasets=datasets, seeds=seeds)
     validate_estimated_calls(
         estimated_calls=estimated_calls,
         really_run_big_bench=really_run_big_bench,
     )
-    reproduction_command = _reproduction_command(methods, datasets, seeds)
     records: list[SeedBenchmarkResult] = []
     loaded_datasets = {
         dataset_name: load_real_world_dataset(dataset_name, cache_root=cache_root)
@@ -116,16 +120,45 @@ def run_agent_comparison(
     llm_methods_requested = any(method.startswith("llm_") for method in methods)
     skip_reason = _llm_skip_reason() if llm_methods_requested else None
-    client = (
-        GroqBenchClient(api_key=os.environ["GROQ_API_KEY"])
-        if llm_methods_requested and skip_reason is None
-        else None
-    )
     for dataset_name in datasets:
         dataset = loaded_datasets[dataset_name]
         for method in methods:
             for seed in range(seeds):
                 if method == "random":
                     result = run_random_episode(dataset, seed=seed)
                 elif method == "heuristic":
@@ -159,6 +192,12 @@ def run_agent_comparison(
                 if method == "heuristic":
                     result = result.model_copy(update={"seed": seed})
                 records.append(result)
     aggregates: list[AggregateBenchmarkResult] = aggregate_seed_results(
         records, seeds_requested=seeds

 from __future__ import annotations
 import os
+import sys
 from pathlib import Path
 from dotenv import load_dotenv
     output_json: Path,
     really_run_big_bench: bool,
     cache_root: Path | None = None,
+    reproduction_command: str | None = None,
 ) -> BenchmarkRunOutput:
     """Run the selected benchmark methods across real-world datasets."""
     load_dotenv()
     _validate_inputs(methods, datasets, seeds)
     estimated_calls = estimate_llm_calls(methods=methods, datasets=datasets, seeds=seeds)
+    # Validate call budget before any client instantiation or dataset loads that could
+    # trigger network access in tests with environment variables set.
     validate_estimated_calls(
         estimated_calls=estimated_calls,
         really_run_big_bench=really_run_big_bench,
     )
+    reproduction_command = reproduction_command or _reproduction_command(methods, datasets, seeds)
     records: list[SeedBenchmarkResult] = []
     loaded_datasets = {
         dataset_name: load_real_world_dataset(dataset_name, cache_root=cache_root)
     llm_methods_requested = any(method.startswith("llm_") for method in methods)
     skip_reason = _llm_skip_reason() if llm_methods_requested else None
+    client = None
+    if llm_methods_requested and skip_reason is None:
+        # Allow env-driven tuning for tiny CI checks.
+        model = os.environ.get("DATAFORGE_GROQ_MODEL", "llama-3.3-70b-versatile")
+        try:
+            min_interval_s = float(os.environ.get("DATAFORGE_GROQ_MIN_INTERVAL_S", "1.0"))
+        except ValueError:
+            min_interval_s = 1.0
+        try:
+            timeout_s = float(os.environ.get("DATAFORGE_GROQ_TIMEOUT_S", "30"))
+        except ValueError:
+            timeout_s = 30.0
+        try:
+            max_tokens = int(os.environ.get("DATAFORGE_GROQ_MAX_TOKENS", "256"))
+        except ValueError:
+            max_tokens = 256
+        try:
+            max_retries = int(os.environ.get("DATAFORGE_GROQ_MAX_RETRIES", "3"))
+        except ValueError:
+            max_retries = 3
+        client = GroqBenchClient(
+            api_key=os.environ["GROQ_API_KEY"],
+            model=model,
+            min_interval_s=min_interval_s,
+            max_tokens=max_tokens,
+            max_retries=max_retries,
+            timeout_s=timeout_s,
+        )
     for dataset_name in datasets:
         dataset = loaded_datasets[dataset_name]
         for method in methods:
             for seed in range(seeds):
+                if os.environ.get("DATAFORGE_BENCH_VERBOSE"):
+                    print(
+                        f"[dataforge bench] start method={method} dataset={dataset_name} seed={seed}",
+                        file=sys.stderr,
+                        flush=True,
+                    )
                 if method == "random":
                     result = run_random_episode(dataset, seed=seed)
                 elif method == "heuristic":
                 if method == "heuristic":
                     result = result.model_copy(update={"seed": seed})
                 records.append(result)
+                if os.environ.get("DATAFORGE_BENCH_VERBOSE"):
+                    print(
+                        f"[dataforge bench] done  method={method} dataset={dataset_name} seed={seed} status={result.status}",
+                        file=sys.stderr,
+                        flush=True,
+                    )
     aggregates: list[AggregateBenchmarkResult] = aggregate_seed_results(
         records, seeds_requested=seeds

dataforge/causal/__init__.py CHANGED Viewed

	@@ -1 +1,21 @@
1	- """Causal analysis ~~package~~ ~~scaffolding~~ for DataForge."""

+"""Causal analysis primitives for DataForge root-cause diagnosis."""
+from dataforge.causal.dag import CausalDAG, CausalEdge
+from dataforge.causal.pc import CausalDiscoveryResult, discover_causal_dag
+from dataforge.causal.root_cause import (
+    CausalRootCauseAnalyzer,
+    ErrorEvidence,
+    RootCauseResult,
+    minimal_root_set,
+)
+__all__ = [
+    "CausalDAG",
+    "CausalDiscoveryResult",
+    "CausalEdge",
+    "CausalRootCauseAnalyzer",
+    "ErrorEvidence",
+    "RootCauseResult",
+    "discover_causal_dag",
+    "minimal_root_set",
+]

dataforge/causal/dag.py ADDED Viewed

	@@ -0,0 +1,174 @@

+"""Column-level causal DAG utilities for root-cause analysis."""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Any
+import networkx as nx  # type: ignore[import-untyped]
+__all__ = ["CausalDAG", "CausalEdge"]
+@dataclass(frozen=True)
+class CausalEdge:
+    """Immutable description of one directed edge between two columns.
+    Args:
+        source: Name of the column the edge starts at.
+        target: Name of the column the edge points to.
+        confidence: Belief in the directed influence, expected in [0.0, 1.0]
+            (the dataclass itself does not enforce the range).
+        provenance: Human-readable label naming where the edge came from.
+    """
+    source: str
+    target: str
+    confidence: float
+    provenance: str
+class CausalDAG:
+    """Directed acyclic graph over dataset column names.
+    Each edge stores a ``confidence`` (clamped to [0.0, 1.0] on insertion) and a
+    ``provenance`` label. Self-edges and edges that would close a cycle are
+    rejected with ``ValueError``.
+    Args:
+        nodes: Column names to register up front; may be empty.
+    Example:
+        >>> dag = CausalDAG(["discount_pct", "order_total"])
+        >>> dag.add_edge("discount_pct", "order_total", confidence=0.9, provenance="fd")
+        >>> dag.is_reachable("discount_pct", "order_total")
+        True
+    """
+    def __init__(self, nodes: list[str] | tuple[str, ...] = ()) -> None:
+        self._graph: nx.DiGraph[Any] = nx.DiGraph()
+        self._graph.add_nodes_from(nodes)
+    @property
+    def nodes(self) -> tuple[str, ...]:
+        """Return every node name, in the order the graph yields them."""
+        return tuple(map(str, self._graph.nodes))
+    @property
+    def edges(self) -> tuple[CausalEdge, ...]:
+        """Return all directed edges as ``CausalEdge`` records.
+        Missing attributes fall back to confidence 0.0 and provenance "unknown".
+        """
+        return tuple(
+            CausalEdge(
+                source=str(tail),
+                target=str(head),
+                confidence=float(meta.get("confidence", 0.0)),
+                provenance=str(meta.get("provenance", "unknown")),
+            )
+            for tail, head, meta in self._graph.edges(data=True)
+        )
+    def add_node(self, column: str) -> None:
+        """Register a column as a node; a no-op when it already exists.
+        Args:
+            column: Column name.
+        """
+        self._graph.add_node(column)
+    def add_edge(
+        self,
+        source: str,
+        target: str,
+        *,
+        confidence: float,
+        provenance: str,
+    ) -> None:
+        """Insert ``source -> target`` while keeping the graph acyclic.
+        Both endpoints are created if missing. The confidence is clamped to
+        [0.0, 1.0] before it is stored.
+        Args:
+            source: Source column name.
+            target: Target column name.
+            confidence: Confidence score; values outside [0.0, 1.0] are clamped.
+            provenance: Source of the edge.
+        Raises:
+            ValueError: If ``source == target`` or the edge would create a cycle.
+        """
+        if source == target:
+            raise ValueError("Causal DAG does not allow self-edges")
+        # Both endpoints must exist before the path query below can run.
+        self._graph.add_node(source)
+        self._graph.add_node(target)
+        closes_cycle = nx.has_path(self._graph, target, source)
+        if closes_cycle:
+            raise ValueError(f"Adding {source!r} -> {target!r} would create a cycle")
+        capped = min(1.0, confidence)
+        clamped = max(0.0, capped)
+        self._graph.add_edge(source, target, confidence=clamped, provenance=provenance)
+    def successors(self, column: str) -> tuple[str, ...]:
+        """Return the direct downstream columns of ``column``.
+        Args:
+            column: Column name.
+        Returns:
+            Direct successor names, or an empty tuple for an unknown column.
+        """
+        if column in self._graph:
+            return tuple(map(str, self._graph.successors(column)))
+        return ()
+    def is_reachable(self, source: str, target: str) -> bool:
+        """Return whether ``target`` can be reached from ``source``.
+        Args:
+            source: Source column name.
+            target: Target column name.
+        Returns:
+            True when the names are equal or a directed path exists; False when
+            either column is unknown to the graph.
+        """
+        if source == target:
+            return True
+        both_known = source in self._graph and target in self._graph
+        if not both_known:
+            return False
+        return bool(nx.has_path(self._graph, source, target))
+    def path_confidence(self, source: str, target: str) -> float:
+        """Return the weakest edge confidence along the shortest path.
+        Args:
+            source: Source column name.
+            target: Target column name.
+        Returns:
+            1.0 when the names are equal, 0.0 when no path exists, otherwise the
+            minimum stored confidence over the edges of one shortest path.
+        """
+        if source == target:
+            return 1.0
+        if not self.is_reachable(source, target):
+            return 0.0
+        hops = nx.shortest_path(self._graph, source, target)
+        # Consecutive hop pairs are exactly the edges of the path.
+        return min(
+            (
+                float(self._graph.edges[tail, head].get("confidence", 0.0))
+                for tail, head in zip(hops, hops[1:])
+            ),
+            default=0.0,
+        )
+    def minimal_root_columns(self, columns: list[str] | tuple[str, ...]) -> tuple[str, ...]:
+        """Return the selected columns that no other selected column reaches.
+        Args:
+            columns: Selected error columns; duplicates are ignored.
+        Returns:
+            Root columns in first-seen order.
+        """
+        # dict.fromkeys de-duplicates while preserving first-seen order.
+        distinct = list(dict.fromkeys(columns))
+        return tuple(
+            candidate
+            for candidate in distinct
+            if not any(
+                other != candidate and self.is_reachable(other, candidate)
+                for other in distinct
+            )
+        )

dataforge/causal/pc.py ADDED Viewed

	@@ -0,0 +1,232 @@

+"""PC-based causal DAG discovery with functional-dependency priors."""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Any
+import numpy as np
+import pandas as pd
+from scipy.stats import chi2_contingency  # type: ignore[import-untyped]
+from dataforge.causal.dag import CausalDAG
+from dataforge.verifier.schema import Schema
+__all__ = ["CausalDiscoveryResult", "discover_causal_dag"]
+@dataclass(frozen=True)
+class CausalDiscoveryResult:
+    """Immutable bundle describing the outcome of a causal discovery run.
+    Args:
+        dag: Directed acyclic graph over the columns.
+        confidence_report: Per-edge confidence or diagnostic values; defaults
+            to a fresh empty dict for every instance.
+        warnings: Non-fatal messages collected during discovery.
+    """
+    dag: CausalDAG
+    confidence_report: dict[str, float] = field(default_factory=dict)
+    warnings: tuple[str, ...] = ()
+def discover_causal_dag(
+    df: pd.DataFrame,
+    schema: Schema | None = None,
+    *,
+    alpha: float = 0.05,
+) -> CausalDiscoveryResult:
+    """Infer a deterministic causal DAG from tabular data and FD priors.
+    Edge sources are applied in priority order: declared functional
+    dependencies (confidence 0.95), causal-learn PC (0.55), then the pairwise
+    dependency fallback. An edge supplied by an earlier source is never
+    overwritten by a later one, and the confidence report only lists edges
+    that were actually accepted into the DAG.
+    Args:
+        df: Input DataFrame.
+        schema: Optional declared schema with functional dependencies.
+        alpha: Significance threshold for independence checks.
+    Returns:
+        CausalDiscoveryResult. A DAG is returned even if PC orientation is
+        underdetermined; low-confidence edges are tagged as such. Rejected
+        edges (self-edges, cycles) and PC failures are reported as warnings.
+    """
+    columns = [str(column) for column in df.columns]
+    dag = CausalDAG(columns)
+    report: dict[str, float] = {}
+    warnings: list[str] = []
+    def _propose(source: str, target: str, confidence: float, provenance: str) -> None:
+        """Add one candidate edge unless an earlier source already supplied it."""
+        if target in dag.successors(source):
+            # Re-adding an existing edge would replace its stored confidence and
+            # provenance with the weaker metadata of a lower-priority source.
+            return
+        rejected_before = len(warnings)
+        _try_add_edge(
+            dag,
+            source,
+            target,
+            confidence=confidence,
+            provenance=provenance,
+            warnings=warnings,
+        )
+        # _try_add_edge signals a rejected edge by appending a warning; only
+        # edges that made it into the DAG belong in the report.
+        if len(warnings) == rejected_before:
+            report.setdefault(f"{source}->{target}", confidence)
+    if schema is not None:
+        for fd in schema.functional_dependencies:
+            for determinant in fd.determinant:
+                _propose(determinant, fd.dependent, 0.95, "functional_dependency_prior")
+    cleaned = _prepare_for_pc(df)
+    pc_edges, pc_warning = _run_causal_learn_pc(cleaned.to_numpy(), columns, alpha)
+    if pc_warning:
+        warnings.append(pc_warning)
+    for source, target in pc_edges:
+        _propose(source, target, 0.55, "causal_learn_pc")
+    for source, target, confidence in _pairwise_dependency_edges(df, alpha):
+        _propose(source, target, confidence, "pairwise_ci_fallback")
+    return CausalDiscoveryResult(dag=dag, confidence_report=report, warnings=tuple(warnings))
+def _prepare_for_pc(df: pd.DataFrame) -> pd.DataFrame:
+    """Encode every column as NaN-free floats for causal-learn PC.
+    A column is treated as numeric when at least half of its values (and at
+    least two) parse as numbers; gaps are filled with the column median. Any
+    other column is replaced by sorted factorization codes, with missing
+    values encoded as the label "<missing>".
+    """
+    encoded = pd.DataFrame(index=df.index)
+    numeric_floor = max(2, int(0.5 * len(df)))
+    for name in df.columns:
+        as_number = pd.to_numeric(df[name], errors="coerce")
+        parsed = as_number.notna()
+        if parsed.sum() < numeric_floor:
+            labels, _ = pd.factorize(df[name].astype("string").fillna("<missing>"), sort=True)
+            encoded[str(name)] = labels.astype(float)
+            continue
+        median_fill = float(as_number.median()) if parsed.any() else 0.0
+        encoded[str(name)] = as_number.fillna(median_fill)
+    return encoded.fillna(0.0)
+def _run_causal_learn_pc(
+    data: np.ndarray[Any, Any], columns: list[str], alpha: float
+) -> tuple[list[tuple[str, str]], str | None]:
+    """Run causal-learn PC and return deterministic directed edges.
+    causal-learn encodes orientation in its graph matrix: ``graph[i, j] == -1``
+    together with ``graph[j, i] == 1`` means ``i -> j``. Oriented edges keep
+    the direction PC chose; undirected or bidirected adjacencies fall back to
+    column order so the output stays deterministic.
+    Args:
+        data: Numeric sample matrix, one column per entry of ``columns``.
+        columns: Column names aligned with the matrix columns.
+        alpha: Significance level passed to PC.
+    Returns:
+        ``(edges, warning)``. ``warning`` is None on success; when the library
+        is missing, PC fails, or no usable matrix is returned, ``edges`` is
+        empty and ``warning`` explains why.
+    """
+    try:
+        from causallearn.search.ConstraintBased.PC import pc  # type: ignore[import-untyped]
+        result = pc(data, alpha=alpha, indep_test="fisherz", stable=True, show_progress=False)
+    except Exception as exc:
+        # Deliberate best-effort: discovery continues with the other edge sources.
+        return [], f"causal-learn PC unavailable or failed: {exc}"
+    matrix = getattr(getattr(result, "G", None), "graph", None)
+    if matrix is None:
+        return [], "causal-learn PC returned no adjacency matrix"
+    arr = np.asarray(matrix)
+    if arr.ndim != 2:
+        return [], "causal-learn PC returned a non-2D adjacency matrix"
+    # Only index pairs valid on both axes can be read in both directions.
+    limit = min(len(columns), arr.shape[0], arr.shape[1])
+    edges: list[tuple[str, str]] = []
+    for i in range(limit):
+        for j in range(i + 1, limit):
+            forward, backward = arr[i, j], arr[j, i]
+            if forward == 0 and backward == 0:
+                continue
+            if forward == 1 and backward == -1:
+                # Arrowhead at i, tail at j: PC oriented this edge as j -> i.
+                edges.append((columns[j], columns[i]))
+            else:
+                # Either i -> j, or no unique orientation: use column order.
+                edges.append((columns[i], columns[j]))
+    return edges, None
+def _pairwise_dependency_edges(df: pd.DataFrame, alpha: float) -> list[tuple[str, str, float]]:
+    """Return low-confidence edges for every column pair that tests as dependent.
+    Pairs are visited in column order and always oriented from the earlier
+    column to the later one. The confidence is ``1 - p`` limited to
+    [0.25, 0.75] and rounded to four decimals.
+    """
+    names = [str(column) for column in df.columns]
+    found: list[tuple[str, str, float]] = []
+    for lo in range(len(names)):
+        for hi in range(lo + 1, len(names)):
+            earlier, later = names[lo], names[hi]
+            p_value = _pairwise_p_value(df[earlier], df[later])
+            if not p_value < alpha:
+                continue
+            limited = min(0.75, 1.0 - p_value)
+            limited = max(0.25, limited)
+            found.append((earlier, later, round(limited, 4)))
+    return found
+def _pairwise_p_value(left: pd.Series[Any], right: pd.Series[Any]) -> float:
+    """Pick an independence test by column kind and return its p-value.
+    A series counts as continuous when at least 80% of its values (and at
+    least five) parse as numbers. Two continuous series use HSIC on
+    median-filled values, two non-continuous series use chi-squared, and a
+    mixed pair uses the binned mutual-information pseudo p-value.
+    """
+    left_num = pd.to_numeric(left, errors="coerce")
+    right_num = pd.to_numeric(right, errors="coerce")
+    left_is_cont = left_num.notna().sum() >= max(5, int(0.8 * len(left)))
+    right_is_cont = right_num.notna().sum() >= max(5, int(0.8 * len(right)))
+    if left_is_cont != right_is_cont:
+        return _mutual_information_p_value(left, right)
+    if not left_is_cont:
+        return _chi_squared_p_value(left, right)
+    left_filled = left_num.fillna(left_num.median())
+    right_filled = right_num.fillna(right_num.median())
+    return _hsic_p_value(left_filled, right_filled)
+def _chi_squared_p_value(left: pd.Series[Any], right: pd.Series[Any]) -> float:
+    """Return the chi-squared independence p-value for two categorical series.
+    Values are compared as strings with missing entries grouped under
+    "<missing>". When either side has fewer than two categories the test is
+    skipped and 1.0 is returned.
+    """
+    row_labels = left.astype("string").fillna("<missing>")
+    col_labels = right.astype("string").fillna("<missing>")
+    contingency = pd.crosstab(row_labels, col_labels)
+    if min(contingency.shape[0], contingency.shape[1]) < 2:
+        return 1.0
+    _, p_value, _, _ = chi2_contingency(contingency)
+    return float(p_value)
+def _hsic_p_value(left: pd.Series[Any], right: pd.Series[Any]) -> float:
+    """Return an HSIC p-value for two continuous series.
+    If hyppo cannot be imported or the test raises, a coarse Pearson fallback
+    is used instead: 0.0 when the absolute correlation exceeds 0.75, else 1.0.
+    """
+    xs = left.to_numpy(dtype=float).reshape(-1, 1)
+    ys = right.to_numpy(dtype=float).reshape(-1, 1)
+    try:
+        from hyppo.independence import Hsic  # type: ignore[import-untyped]
+        _, p_value = Hsic().test(xs, ys, reps=100, auto=True)
+        return float(p_value)
+    except Exception:
+        strength = abs(float(np.corrcoef(xs.ravel(), ys.ravel())[0, 1]))
+        if strength > 0.75:
+            return 0.0
+        return 1.0
+def _mutual_information_p_value(left: pd.Series[Any], right: pd.Series[Any]) -> float:
+    """Return ``exp(-MI)`` of the two coded series as a bounded pseudo p-value.
+    Both series are converted to integer codes, cross-tabulated, and the
+    mutual information of the resulting joint distribution is computed in
+    nats. An empty table, or one with fewer than two rows or columns, gives 1.0.
+    """
+    table = pd.crosstab(_codes(left), _codes(right))
+    total = float(table.to_numpy().sum())
+    if total == 0.0 or min(table.shape[0], table.shape[1]) < 2:
+        return 1.0
+    joint = table.to_numpy(dtype=float) / total
+    row_marginal = joint.sum(axis=1, keepdims=True)
+    col_marginal = joint.sum(axis=0, keepdims=True)
+    independent = row_marginal @ col_marginal
+    # Zero cells contribute nothing and would otherwise produce log(0).
+    observed = joint > 0
+    log_ratio = np.log(joint[observed] / independent[observed])
+    mi = float((joint[observed] * log_ratio).sum())
+    return float(np.exp(-mi))
+def _codes(series: pd.Series[Any]) -> np.ndarray[Any, Any]:
+    """Return integer codes for a series of numbers, labels, or a mix.
+    Mostly-numeric series (at least 80% and at least five parseable values)
+    are median-filled and binned into up to four quantile buckets. Everything
+    else is factorized as strings in sorted order, with missing values
+    labelled "<missing>".
+    """
+    as_number = pd.to_numeric(series, errors="coerce")
+    numeric_floor = max(5, int(0.8 * len(series)))
+    if as_number.notna().sum() < numeric_floor:
+        labels, _ = pd.factorize(series.astype("string").fillna("<missing>"), sort=True)
+        return labels
+    buckets = pd.qcut(as_number.fillna(as_number.median()), q=4, duplicates="drop")
+    return buckets.cat.codes.to_numpy()
+def _try_add_edge(
+    dag: CausalDAG,
+    source: str,
+    target: str,
+    *,
+    confidence: float,
+    provenance: str,
+    warnings: list[str],
+) -> None:
+    """Insert an edge, turning a ``ValueError`` rejection into a warning entry.
+    Args:
+        dag: Graph to mutate.
+        source: Source column name.
+        target: Target column name.
+        confidence: Confidence forwarded to ``dag.add_edge``.
+        provenance: Provenance label forwarded to ``dag.add_edge``.
+        warnings: List that receives the rejection message, if any.
+    """
+    edge_meta = {"confidence": confidence, "provenance": provenance}
+    try:
+        dag.add_edge(source, target, **edge_meta)
+    except ValueError as rejection:
+        warnings.append(str(rejection))

dataforge/causal/root_cause.py ADDED Viewed

	@@ -0,0 +1,193 @@

+"""Minimal root-cause selection over detected errors and a causal DAG."""
+from __future__ import annotations
+from typing import Any, Protocol
+from pydantic import BaseModel, Field
+from dataforge.causal.dag import CausalDAG
+__all__ = [
+    "CausalRootCauseAnalyzer",
+    "ErrorEvidence",
+    "RootCauseResult",
+    "evidence_from_issue",
+    "minimal_root_set",
+]
+class _IssueLike(Protocol):
+    """Structural type for any object exposing ``row``, ``column`` and ``issue_type``."""
+    row: int
+    column: str
+    issue_type: str
+class ErrorEvidence(BaseModel):
+    """Frozen record of one detected error, keyed by the column it occurred in.
+    Args:
+        index: Non-negative position of the error in the caller's issue list.
+        row: Non-negative row index where the error was detected.
+        column: Non-empty name of the column where the error was detected.
+        issue_type: Non-empty machine-readable issue type.
+    """
+    index: int = Field(ge=0)
+    row: int = Field(ge=0)
+    column: str = Field(min_length=1)
+    issue_type: str = Field(min_length=1)
+    # Instances are immutable once validated.
+    model_config = {"frozen": True}
+class RootCauseResult(BaseModel):
+    """Frozen outcome of a root-cause analysis.
+    Args:
+        root_indices: Indices of the errors selected as minimal roots.
+        root_columns: Columns of those roots, aligned with ``root_indices``.
+        covered_indices: Indices of the supplied errors explained by a root.
+        confidence: Mean path confidence from roots to covered errors.
+        explanation: Human-readable summary of the selection.
+    """
+    root_indices: list[int]
+    root_columns: list[str]
+    covered_indices: list[int]
+    confidence: float
+    explanation: str
+    # Instances are immutable once validated.
+    model_config = {"frozen": True}
+class CausalRootCauseAnalyzer:
+    """Select the minimal set of root errors among detected errors.
+    An error is a root when no other supplied error sits causally upstream of
+    it in the DAG. Among errors in the same column, the one with the lowest
+    index is treated as upstream of the others.
+    Args:
+        dag: Column-level causal DAG.
+    Example:
+        >>> dag = CausalDAG(["discount_pct", "order_total"])
+        >>> dag.add_edge("discount_pct", "order_total", confidence=0.9, provenance="formula")
+        >>> errors = [
+        ...     ErrorEvidence(index=0, row=1, column="discount_pct", issue_type="bad"),
+        ...     ErrorEvidence(index=1, row=1, column="order_total", issue_type="bad"),
+        ... ]
+        >>> CausalRootCauseAnalyzer(dag).analyze(errors).root_indices
+        [0]
+    """
+    def __init__(self, dag: CausalDAG) -> None:
+        self._dag = dag
+    def analyze(self, errors: list[ErrorEvidence] | tuple[ErrorEvidence, ...]) -> RootCauseResult:
+        """Compute roots, coverage and confidence for the supplied errors.
+        Args:
+            errors: Selected detected errors.
+        Returns:
+            RootCauseResult with roots, coverage, confidence, and explanation.
+            An empty input yields an empty result with confidence 0.0.
+        """
+        if not errors:
+            return RootCauseResult(
+                root_indices=[],
+                root_columns=[],
+                covered_indices=[],
+                confidence=0.0,
+                explanation="No errors were supplied.",
+            )
+        roots = [
+            candidate
+            for candidate in errors
+            if not self._has_upstream_selected_error(candidate, errors)
+        ]
+        covered: list[int] = []
+        path_confidences: list[float] = []
+        for error in errors:
+            # The first root that reaches the error is the one credited with it.
+            explaining_root = next(
+                (
+                    root
+                    for root in roots
+                    if root.column == error.column
+                    or self._dag.is_reachable(root.column, error.column)
+                ),
+                None,
+            )
+            if explaining_root is None:
+                continue
+            covered.append(error.index)
+            path_confidences.append(
+                self._dag.path_confidence(explaining_root.column, error.column)
+            )
+        if path_confidences:
+            confidence = round(sum(path_confidences) / len(path_confidences), 4)
+        else:
+            confidence = 0.0
+        root_columns = [root.column for root in roots]
+        return RootCauseResult(
+            root_indices=[root.index for root in roots],
+            root_columns=root_columns,
+            covered_indices=covered,
+            confidence=confidence,
+            explanation=self._explain(root_columns, len(covered), len(errors)),
+        )
+    def _has_upstream_selected_error(
+        self,
+        candidate: ErrorEvidence,
+        errors: list[ErrorEvidence] | tuple[ErrorEvidence, ...],
+    ) -> bool:
+        """Return whether some other supplied error causally precedes ``candidate``.
+        Same-column errors are ordered by index; different-column errors are
+        ordered by reachability in the DAG.
+        """
+        return any(
+            other.index != candidate.index
+            and (
+                (other.column == candidate.column and other.index < candidate.index)
+                or (
+                    other.column != candidate.column
+                    and self._dag.is_reachable(other.column, candidate.column)
+                )
+            )
+            for other in errors
+        )
+    @staticmethod
+    def _explain(root_columns: list[str], covered_count: int, total_count: int) -> str:
+        """Format the one-line summary stored in ``RootCauseResult.explanation``."""
+        if root_columns:
+            joined = ", ".join(root_columns)
+            return f"Selected {joined} as minimal roots covering {covered_count}/{total_count} errors."
+        return "No minimal roots were found."
+def minimal_root_set(
+    errors: list[ErrorEvidence] | tuple[ErrorEvidence, ...], dag: CausalDAG
+) -> RootCauseResult:
+    """Run ``CausalRootCauseAnalyzer`` once without keeping the analyzer around.
+    Args:
+        errors: Selected detected errors.
+        dag: Column-level causal DAG.
+    Returns:
+        The analyzer's result for ``errors``.
+    """
+    analyzer = CausalRootCauseAnalyzer(dag)
+    return analyzer.analyze(errors)
+def evidence_from_issue(index: int, issue: _IssueLike | dict[str, Any]) -> ErrorEvidence:
+    """Convert an issue object or issue dictionary into ``ErrorEvidence``.
+    Dictionaries are read with defaults: row 0, an empty column, and an issue
+    type taken from "type", then "issue_type", then "unknown". Objects are
+    read through their ``row``, ``column`` and ``issue_type`` attributes.
+    Args:
+        index: Error index to assign.
+        issue: Object or dictionary with row/column/type fields.
+    Returns:
+        ErrorEvidence instance.
+    """
+    if not isinstance(issue, dict):
+        return ErrorEvidence(
+            index=index,
+            row=int(issue.row),
+            column=str(issue.column),
+            issue_type=str(issue.issue_type),
+        )
+    fallback_type = issue.get("issue_type", "unknown")
+    return ErrorEvidence(
+        index=index,
+        row=int(issue.get("row", 0)),
+        column=str(issue.get("column", "")),
+        issue_type=str(issue.get("type", fallback_type)),
+    )

dataforge/cli/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-"""Typer application entrypoint for DataForge.
 Each CLI subcommand is defined in its own module under ``dataforge.cli.*``
 and registered here. The ``app`` object is the entry point referenced by
@@ -7,13 +7,16 @@ and registered here. The ``app`` object is the entry point referenced by
 import typer
 from dataforge.cli.bench import bench
 from dataforge.cli.profile import profile
 from dataforge.cli.repair import repair
 from dataforge.cli.revert import revert
 app: typer.Typer = typer.Typer(
-    help="DataForge — AI-powered data-quality detection and repair.",
     no_args_is_help=True,
 )
@@ -28,15 +31,18 @@ def _main(
         is_eager=True,
     ),
 ) -> None:
-    """DataForge — AI-powered data-quality detection and repair."""
     if version:
         from dataforge import __version__
-        typer.echo(f"dataforge {__version__}")
         raise typer.Exit()
 app.command(name="profile")(profile)
 app.command(name="repair")(repair)
 app.command(name="revert")(revert)
 app.command(name="bench")(bench)

+"""Typer application entrypoint for DataForge15.
 Each CLI subcommand is defined in its own module under ``dataforge.cli.*``
 and registered here. The ``app`` object is the entry point referenced by
 import typer
+from dataforge.cli.audit import audit
 from dataforge.cli.bench import bench
 from dataforge.cli.profile import profile
 from dataforge.cli.repair import repair
+from dataforge.cli.release import release_app
 from dataforge.cli.revert import revert
+from dataforge.cli.watch import watch
 app: typer.Typer = typer.Typer(
+    help="DataForge15 - AI-powered data-quality detection and repair.",
     no_args_is_help=True,
 )
         is_eager=True,
     ),
 ) -> None:
+    """DataForge15 - AI-powered data-quality detection and repair."""
     if version:
         from dataforge import __version__
+        typer.echo(f"dataforge15 {__version__}")
         raise typer.Exit()
 app.command(name="profile")(profile)
 app.command(name="repair")(repair)
 app.command(name="revert")(revert)
+app.command(name="audit")(audit)
 app.command(name="bench")(bench)
+app.command(name="watch")(watch)
+app.add_typer(release_app, name="release")

dataforge/cli/audit.py ADDED Viewed

	@@ -0,0 +1,70 @@

+"""CLI subcommand: ``dataforge audit <txn_id>``."""
+from __future__ import annotations
+import json
+from pathlib import Path
+from typing import Annotated
+import typer
+from rich.console import Console
+from rich.panel import Panel
+from dataforge.transactions import TransactionAuditVerdict, verify_transaction_log
+_console = Console(stderr=True)
+def audit(
+    txn_id: Annotated[
+        str,
+        typer.Argument(help="Transaction identifier to audit."),
+    ],
+    search_root: Annotated[
+        Path | None,
+        typer.Option(
+            "--search-root",
+            help="Root directory used to locate the transaction log.",
+            exists=True,
+            file_okay=False,
+            dir_okay=True,
+            readable=True,
+        ),
+    ] = None,
+    log_path: Annotated[
+        Path | None,
+        typer.Option(
+            "--log-path",
+            help="Explicit JSONL transaction log path.",
+            exists=True,
+            file_okay=True,
+            dir_okay=False,
+            readable=True,
+        ),
+    ] = None,
+    json_output: Annotated[
+        bool,
+        typer.Option("--json", help="Print the audit report as JSON."),
+    ] = False,
+) -> None:
+    """Verify a transaction log's local hash chain."""
+    # The docstring above is kept verbatim: Typer shows it as the command help.
+    report = verify_transaction_log(txn_id, log_path=log_path, search_root=search_root)
+    verified = report.verdict == TransactionAuditVerdict.VERIFIED
+    if not json_output:
+        # Human-readable panel on the stderr console; green only when verified.
+        body = (
+            f"Verdict: [bold]{report.verdict.value}[/bold]\n"
+            f"Transaction: {report.txn_id or txn_id}\n"
+            f"Events: {report.event_count}\n"
+            f"Head SHA-256: {report.head_sha256 or 'n/a'}"
+        )
+        if report.errors:
+            bullet_list = "\n".join(f"- {error}" for error in report.errors)
+            body += "\n\n" + bullet_list
+        panel_style = "green" if verified else "red"
+        _console.print(Panel(body, title="Transaction Audit", style=panel_style))
+    else:
+        payload = report.model_dump(mode="json")
+        typer.echo(json.dumps(payload, indent=2, sort_keys=True))
+    # Exit code contract: 0 = verified, 1 = legacy/unverified, 2 = anything else.
+    if verified:
+        exit_code = 0
+    elif report.verdict == TransactionAuditVerdict.LEGACY_UNVERIFIED:
+        exit_code = 1
+    else:
+        exit_code = 2
+    raise typer.Exit(code=exit_code)

dataforge/cli/bench.py CHANGED Viewed

@@ -2,17 +2,18 @@
 from __future__ import annotations
 from pathlib import Path
-from typing import Annotated
 import typer
 from rich.console import Console
 from rich.panel import Panel
 from rich.table import Table
-from dataforge.bench.runner import run_agent_comparison
 _console = Console(stderr=True)
 def _parse_csv_list(raw_value: str) -> list[str]:
@@ -21,6 +22,16 @@ def _parse_csv_list(raw_value: str) -> list[str]:
     return [value for value in values if value]
 def bench(
     methods: Annotated[
         str,
@@ -54,10 +65,14 @@ def bench(
             help="Where to write eval/results/agent_comparison.json.",
         ),
     ] = Path("eval/results/agent_comparison.json"),
 ) -> None:
     """Run real-world benchmark methods across cached benchmark datasets."""
     try:
-        output = run_agent_comparison(
             methods=_parse_csv_list(methods),
             datasets=_parse_csv_list(datasets),
             seeds=seeds,
@@ -74,6 +89,10 @@ def bench(
         )
         raise typer.Exit(code=2) from exc
     table = Table(title="DataForge Benchmark Summary")
     table.add_column("Method")
     table.add_column("Dataset")

 from __future__ import annotations
+import json
+from collections.abc import Callable
 from pathlib import Path
+from typing import Annotated, Any
 import typer
 from rich.console import Console
 from rich.panel import Panel
 from rich.table import Table
 _console = Console(stderr=True)
+run_agent_comparison: Callable[..., Any] | None = None
 def _parse_csv_list(raw_value: str) -> list[str]:
     return [value for value in values if value]
+def _runner() -> Callable[..., Any]:
+    """Return the benchmark runner, importing it on first use.
+    The import is deferred so that importing the CLI module does not pull in
+    ``dataforge.bench.runner``; the loaded callable is cached in the
+    module-level ``run_agent_comparison`` name.
+    """
+    global run_agent_comparison
+    if run_agent_comparison is not None:
+        return run_agent_comparison
+    from dataforge.bench.runner import run_agent_comparison as loaded_runner
+    run_agent_comparison = loaded_runner
+    return run_agent_comparison
 def bench(
     methods: Annotated[
         str,
             help="Where to write eval/results/agent_comparison.json.",
         ),
     ] = Path("eval/results/agent_comparison.json"),
+    json_output: Annotated[
+        bool,
+        typer.Option("--json", help="Print benchmark results as JSON."),
+    ] = False,
 ) -> None:
     """Run real-world benchmark methods across cached benchmark datasets."""
     try:
+        output = _runner()(
             methods=_parse_csv_list(methods),
             datasets=_parse_csv_list(datasets),
             seeds=seeds,
         )
         raise typer.Exit(code=2) from exc
+    if json_output:
+        typer.echo(json.dumps(output.model_dump(mode="json"), indent=2, sort_keys=True))
+        return
     table = Table(title="DataForge Benchmark Summary")
     table.add_column("Method")
     table.add_column("Dataset")

dataforge/cli/common.py CHANGED Viewed

@@ -3,13 +3,14 @@
 from __future__ import annotations
 from collections.abc import Iterable
 from pathlib import Path
 from typing import cast
-import pandas as pd
 import typer
 import yaml
 from dataforge.verifier.schema import (
     AggregateDependency,
     AggregateLiteral,
@@ -18,6 +19,27 @@ from dataforge.verifier.schema import (
     Schema,
 )
 def schema_from_mapping(raw_mapping: object) -> Schema:
     """Build a Schema from a raw YAML mapping-like payload.
@@ -149,13 +171,13 @@ def load_schema(schema_path: Path) -> Schema:
     return schema_from_mapping(raw)
-def read_csv(path: Path) -> pd.DataFrame:
     """Read a CSV using conservative string-preserving defaults.
     Args:
         path: CSV path.
     Returns:
-        A DataFrame with string-preserved values.
     """
-    return pd.read_csv(path, dtype=str, keep_default_na=False, na_filter=False)

 from __future__ import annotations
 from collections.abc import Iterable
+from importlib import resources
 from pathlib import Path
 from typing import cast
 import typer
 import yaml
+from dataforge.table import Table, read_csv as read_table_csv
 from dataforge.verifier.schema import (
     AggregateDependency,
     AggregateLiteral,
     Schema,
 )
+# Relative paths that may be served from the installed ``dataforge`` package
+# when they do not exist on disk (alias -> resource name inside the package).
+_PACKAGED_DEMO_FIXTURES = {
+    "fixtures/hospital_10rows.csv": "fixtures/hospital_10rows.csv",
+    "fixtures/hospital_schema.yaml": "fixtures/hospital_schema.yaml",
+}
+def resolve_cli_path(path: Path) -> Path:
+    """Resolve a user path, including DataForge's packaged demo fixture aliases.
+    Args:
+        path: Path exactly as supplied on the command line.
+    Returns:
+        ``path`` unchanged when it exists on disk or is not a known demo
+        fixture alias; otherwise the location of the packaged fixture, if the
+        package actually ships it.
+    """
+    if path.exists():
+        return path
+    normalized = path.as_posix().replace("\\", "/")
+    # Strip only literal "./" prefixes. ``str.lstrip("./")`` removes any run of
+    # "." and "/" characters, which wrongly mapped missing absolute or
+    # parent-relative paths ("/fixtures/...", "../fixtures/...") onto the
+    # packaged demo fixtures.
+    while normalized.startswith("./"):
+        normalized = normalized.removeprefix("./")
+    packaged_name = _PACKAGED_DEMO_FIXTURES.get(normalized)
+    if packaged_name is None:
+        return path
+    fixture = resources.files("dataforge").joinpath(packaged_name)
+    if not fixture.is_file():
+        return path
+    return Path(str(fixture))
 def schema_from_mapping(raw_mapping: object) -> Schema:
     """Build a Schema from a raw YAML mapping-like payload.
     return schema_from_mapping(raw)
+def read_csv(path: Path) -> Table:
     """Read a CSV using conservative string-preserving defaults.
     Args:
         path: CSV path.
     Returns:
+        A string-preserving DataForge table.
     """
+    return read_table_csv(path)

dataforge/cli/profile.py CHANGED Viewed

@@ -1,31 +1,46 @@
 """CLI subcommand: ``dataforge profile <path> [--schema <yaml>]``.
 Reads a CSV file, runs all detectors, and renders detected issues as a
-rich-formatted terminal table.  Exit code 0 if no UNSAFE issues; 1 otherwise.
 """
 from __future__ import annotations
 from pathlib import Path
-from typing import Annotated
 import typer
 from rich.console import Console
-from dataforge.cli.common import load_schema, read_csv
 from dataforge.detectors import run_all_detectors
-from dataforge.detectors.base import Schema, Severity
 from dataforge.ui.profile_view import render_profile_table
 _console = Console(stderr=True)
 def profile(
     path: Annotated[
         Path,
         typer.Argument(
-            exists=True,
-            readable=True,
             help="Path to the CSV file to profile.",
         ),
     ],
@@ -33,22 +48,36 @@ def profile(
         Path | None,
         typer.Option(
             "--schema",
-            exists=True,
-            readable=True,
             help="Path to a YAML schema file with column types and FDs.",
         ),
     ] = None,
 ) -> None:
     """Profile a CSV file for data-quality issues.
     Reads the CSV, runs all detectors (type_mismatch, decimal_shift,
     fd_violation), and renders a rich-formatted table of detected issues.
-    Exit code 0 if no UNSAFE issues are found; 1 if any UNSAFE issues exist.
     """
     # Load the CSV with dtype=str to avoid pandas type-coercion artifacts.
     try:
-        df = read_csv(path)
     except Exception as exc:
         _console.print(f"[bold red]Error reading CSV:[/bold red] {exc}")
         raise typer.Exit(code=2) from exc
@@ -56,16 +85,32 @@ def profile(
     # Optionally load schema.
     parsed_schema: Schema | None = None
     if schema is not None:
-        parsed_schema = load_schema(schema)
     # Run all detectors.
     issues = run_all_detectors(df, parsed_schema)
     # Render the results.
-    output_console = Console()
-    render_profile_table(issues, output_console, file_path=str(path))
-    # Exit code based on UNSAFE issues.
-    has_unsafe = any(i.severity == Severity.UNSAFE for i in issues)
-    if has_unsafe:
         raise typer.Exit(code=1)

 """CLI subcommand: ``dataforge profile <path> [--schema <yaml>]``.
 Reads a CSV file, runs all detectors, and renders detected issues as a
+rich-formatted terminal table. Diagnostics exit 0 by default; use
+``--fail-on`` for CI gating.
 """
 from __future__ import annotations
+import json
+from collections.abc import Sequence
 from pathlib import Path
+from typing import Annotated, Literal
 import typer
 from rich.console import Console
+from dataforge.cli.common import load_schema, read_csv, resolve_cli_path
 from dataforge.detectors import run_all_detectors
+from dataforge.detectors.base import Issue, Schema, Severity
 from dataforge.ui.profile_view import render_profile_table
 _console = Console(stderr=True)
+FailOn = Literal["never", "unsafe", "review", "any"]
+def _should_fail(issues: Sequence[Issue], fail_on: FailOn) -> bool:
+    """Decide whether the findings trip the ``--fail-on`` CI gate.
+    Args:
+        issues: Issues produced by the detectors.
+        fail_on: Threshold name: ``never``, ``any``, ``unsafe`` or ``review``.
+    Returns:
+        ``False`` for ``never``; for ``any``, whether there is at least one
+        issue; for ``unsafe``, whether some issue has severity
+        ``Severity.UNSAFE``; otherwise whether some severity compares
+        ``>= Severity.REVIEW``.
+    """
+    if fail_on == "never":
+        return False
+    if fail_on == "any":
+        return bool(issues)
+    observed = [finding.severity for finding in issues]
+    unsafe_only = fail_on == "unsafe"
+    for level in observed:
+        if unsafe_only:
+            if level == Severity.UNSAFE:
+                return True
+        elif level >= Severity.REVIEW:
+            return True
+    return False
 def profile(
     path: Annotated[
         Path,
         typer.Argument(
             help="Path to the CSV file to profile.",
         ),
     ],
         Path | None,
         typer.Option(
             "--schema",
             help="Path to a YAML schema file with column types and FDs.",
         ),
     ] = None,
+    json_output: Annotated[
+        bool,
+        typer.Option("--json", help="Print profile results as JSON."),
+    ] = False,
+    fail_on: Annotated[
+        FailOn,
+        typer.Option(
+            "--fail-on",
+            help="Exit 1 when findings meet this threshold: never, unsafe, review, any.",
+        ),
+    ] = "never",
 ) -> None:
     """Profile a CSV file for data-quality issues.
     Reads the CSV, runs all detectors (type_mismatch, decimal_shift,
     fd_violation), and renders a rich-formatted table of detected issues.
+    Exit code 0 unless ``--fail-on`` is set and matching findings are present.
     """
+    resolved_path = resolve_cli_path(path)
+    if not resolved_path.exists():
+        _console.print(f"[bold red]CSV file not found:[/bold red] {path}")
+        raise typer.Exit(code=2)
     # Load the CSV with dtype=str to avoid pandas type-coercion artifacts.
     try:
+        df = read_csv(resolved_path)
     except Exception as exc:
         _console.print(f"[bold red]Error reading CSV:[/bold red] {exc}")
         raise typer.Exit(code=2) from exc
     # Optionally load schema.
     parsed_schema: Schema | None = None
     if schema is not None:
+        resolved_schema = resolve_cli_path(schema)
+        if not resolved_schema.exists():
+            _console.print(f"[bold red]Schema file not found:[/bold red] {schema}")
+            raise typer.Exit(code=2)
+        parsed_schema = load_schema(resolved_schema)
     # Run all detectors.
     issues = run_all_detectors(df, parsed_schema)
     # Render the results.
+    if json_output:
+        typer.echo(
+            json.dumps(
+                {
+                    "path": str(resolved_path),
+                    "issues_count": len(issues),
+                    "fail_on": fail_on,
+                    "issues": [issue.model_dump(mode="json") for issue in issues],
+                },
+                indent=2,
+                sort_keys=True,
+            )
+        )
+    else:
+        output_console = Console()
+        render_profile_table(issues, output_console, file_path=str(resolved_path))
+    if _should_fail(issues, fail_on):
         raise typer.Exit(code=1)

dataforge/cli/release.py ADDED Viewed

	@@ -0,0 +1,39 @@

+"""CLI group for local release verification."""
+from __future__ import annotations
+import json
+from pathlib import Path
+from typing import Annotated
+import typer
+from dataforge.release.doctor import DEFAULT_KAGGLE_CREDENTIALS, run_doctor
+release_app = typer.Typer(help="Release verification utilities.", no_args_is_help=True)
+@release_app.command(name="doctor")
+def doctor(
+    json_output: Annotated[
+        bool,
+        typer.Option("--json", help="Print machine-readable JSON."),
+    ] = False,
+    kaggle_credentials: Annotated[
+        Path,
+        typer.Option(
+            "--kaggle-credentials",
+            help="Path to Kaggle OAuth credentials.json. Legacy kaggle.json is never read.",
+        ),
+    ] = DEFAULT_KAGGLE_CREDENTIALS,
+) -> None:
+    """Verify local release/deploy auth without printing secrets."""
+    # The docstring above is kept verbatim: Typer shows it as the command help.
+    report = run_doctor(kaggle_credentials=kaggle_credentials)
+    if not json_output:
+        # One line per check, with the status label padded to four characters.
+        for check in report.checks:
+            status = "fail"
+            if check.ok:
+                status = "ok"
+            typer.echo(f"{status:4} {check.name}: {check.detail}")
+    else:
+        typer.echo(json.dumps(report.to_dict(), indent=2, sort_keys=True))
+    # Exit 0 when the overall report is ok, 2 otherwise.
+    exit_code = 2
+    if report.ok:
+        exit_code = 0
+    raise typer.Exit(code=exit_code)

dataforge/cli/repair.py CHANGED Viewed

@@ -2,32 +2,25 @@
 from __future__ import annotations
-import hashlib
-from datetime import UTC, datetime
 from pathlib import Path
-from typing import Annotated
-import pandas as pd
 import typer
 from rich.console import Console
 from rich.panel import Panel
-from dataforge.cli.common import load_schema, read_csv
-from dataforge.detectors import run_all_detectors
 from dataforge.detectors.base import Issue, Schema
-from dataforge.repairers import build_repairers
-from dataforge.repairers.base import ProposedFix, RepairAttempt, RetryContext
-from dataforge.safety import SafetyContext, SafetyFilter, SafetyResult, SafetyVerdict
-from dataforge.transactions.log import (
-    append_applied_event,
-    append_created_transaction,
-    cache_dir_for,
-    sha256_bytes,
-    snapshot_path_for,
-)
-from dataforge.transactions.txn import CellFix, RepairTransaction, generate_txn_id
 from dataforge.ui.repair_diff import render_repair_diff
-from dataforge.verifier import SMTVerifier, VerificationVerdict
 _console = Console(stderr=True)
@@ -45,32 +38,19 @@ def apply_fixes_to_csv(path: Path, fixes: list[CellFix]) -> str:
     Raises:
         ValueError: If a fix references a missing row/column or stale old value.
     """
-    df = read_csv(path)
-    for fix in fixes:
-        if fix.operation != "update":
-            raise ValueError(f"Unsupported repair operation '{fix.operation}' for row {fix.row}.")
-        if fix.column not in df.columns:
-            raise ValueError(f"Column '{fix.column}' not found in '{path}'.")
-        if fix.row < 0 or fix.row >= len(df.index):
-            raise ValueError(f"Row {fix.row} is out of bounds for '{path}'.")
-        current_value = str(df.at[fix.row, fix.column])
-        if current_value != fix.old_value:
-            raise ValueError(
-                f"Refusing to apply stale fix for row {fix.row}, column '{fix.column}': "
-                f"expected '{fix.old_value}', found '{current_value}'."
-            )
-        df.at[fix.row, fix.column] = fix.new_value
-    df.to_csv(path, index=False, lineterminator="\n")
-    return hashlib.sha256(path.read_bytes()).hexdigest()
 def _resolve_schema(schema_path: Path | None) -> Schema | None:
     """Resolve an optional schema path into a parsed Schema."""
     if schema_path is None:
         return None
-    return load_schema(schema_path)
 def _print_error(message: str, *, hint: str | None = None) -> None:
@@ -94,157 +74,21 @@ def _propose_repairs(
     confirm_escalations: bool,
     interactive: bool,
 ) -> tuple[list[ProposedFix], list[list[RepairAttempt]]]:
-    """Run repairers and gates issue-by-issue against the working dataframe."""
-    repairers = build_repairers(
-        cache_dir=cache_dir_for(path),
         allow_llm=allow_llm,
         model=model,
-    )
-    safety_filter = SafetyFilter()
-    verifier = SMTVerifier()
-    safety_context = SafetyContext(
         allow_pii=allow_pii,
         confirm_pii=confirm_pii,
         confirm_escalations=confirm_escalations,
-    )
-    accepted_fixes: list[ProposedFix] = []
-    attempt_groups: list[list[RepairAttempt]] = []
-    for issue in issues:
-        attempts: list[RepairAttempt] = []
-        repairer = repairers.get(issue.issue_type)
-        if repairer is None:
-            attempts.append(
-                RepairAttempt(
-                    issue=issue,
-                    attempt_number=1,
-                    status="attempted_not_fixed",
-                    reason="No repairer is registered for this issue type.",
-                )
-            )
-            attempt_groups.append(attempts)
-            continue
-        accepted = False
-        retry_context = RetryContext(issue=issue)
-        for attempt_number in range(1, 4):
-            candidate = repairer.propose(issue, working_df, schema, retry_context=retry_context)
-            if candidate is None:
-                attempts.append(
-                    RepairAttempt(
-                        issue=issue,
-                        attempt_number=attempt_number,
-                        status="attempted_not_fixed",
-                        reason="No repair proposal was available for this issue.",
-                    )
-                )
-                break
-            preferred = safety_filter.choose_preferred([candidate], schema, safety_context)
-            safety_result = safety_filter.evaluate(preferred, schema, safety_context)
-            if safety_result.verdict == SafetyVerdict.ESCALATE and interactive:
-                safety_context, safety_result = _resolve_escalation(
-                    preferred,
-                    schema,
-                    safety_context,
-                    safety_filter,
-                    safety_result,
-                )
-            if safety_result.verdict == SafetyVerdict.DENY:
-                attempts.append(
-                    RepairAttempt(
-                        issue=issue,
-                        attempt_number=attempt_number,
-                        fix=preferred,
-                        status="denied",
-                        reason=safety_result.reason,
-                    )
-                )
-                retry_context = _build_retry_context(issue, attempts)
-                continue
-            if safety_result.verdict == SafetyVerdict.ESCALATE:
-                attempts.append(
-                    RepairAttempt(
-                        issue=issue,
-                        attempt_number=attempt_number,
-                        fix=preferred,
-                        status="escalated",
-                        reason=safety_result.reason,
-                    )
-                )
-                break
-            verifier_result = verifier.verify(working_df, [preferred], schema)
-            if verifier_result.verdict == VerificationVerdict.ACCEPT:
-                accepted_fixes.append(preferred)
-                working_df.at[preferred.fix.row, preferred.fix.column] = preferred.fix.new_value
-                attempts.append(
-                    RepairAttempt(
-                        issue=issue,
-                        attempt_number=attempt_number,
-                        fix=preferred,
-                        status="accepted",
-                        reason=verifier_result.reason,
-                    )
-                )
-                accepted = True
-                break
-            attempts.append(
-                RepairAttempt(
-                    issue=issue,
-                    attempt_number=attempt_number,
-                    fix=preferred,
-                    status=(
-                        "rejected"
-                        if verifier_result.verdict == VerificationVerdict.REJECT
-                        else "unknown"
-                    ),
-                    reason=verifier_result.reason,
-                    unsat_core=verifier_result.unsat_core,
-                )
-            )
-            retry_context = _build_retry_context(issue, attempts)
-        if (
-            not accepted
-            and attempts
-            and attempts[-1].status not in {"attempted_not_fixed", "escalated"}
-        ):
-            last_reason = attempts[-1].reason
-            attempts[-1] = attempts[-1].model_copy(
-                update={
-                    "status": "attempted_not_fixed",
-                    "reason": (
-                        f"Issue was attempted but not fixed after {len(attempts)} attempt(s). "
-                        f"Last failure: {last_reason}"
-                    ),
-                }
-            )
-        attempt_groups.append(attempts)
-    return accepted_fixes, attempt_groups
-def _build_retry_context(issue: Issue, attempts: list[RepairAttempt]) -> RetryContext:
-    """Build retry hints from previous failed attempts."""
-    rejected_values = frozenset(
-        attempt.fix.fix.new_value
-        for attempt in attempts
-        if attempt.fix is not None and attempt.status in {"denied", "rejected", "unknown"}
-    )
-    hints: list[str] = []
-    for attempt in attempts:
-        hints.append(attempt.reason)
-        hints.extend(attempt.unsat_core)
-    return RetryContext(
-        issue=issue,
-        previous_attempts=tuple(attempts),
-        rejected_values=rejected_values,
-        hints=tuple(hints),
     )
@@ -309,45 +153,46 @@ def _render_attempt_summary(
     return len(failed_groups)
 def _apply_transaction(
     path: Path,
     fixes: list[ProposedFix],
     source_bytes: bytes,
 ) -> str:
-    """Write a transaction record, apply fixes, and append the applied event."""
-    resolved_path = path.resolve()
-    txn_id = generate_txn_id()
-    snapshot_path = snapshot_path_for(resolved_path, txn_id)
-    snapshot_path.parent.mkdir(parents=True, exist_ok=True)
-    snapshot_path.write_bytes(source_bytes)
-    transaction = RepairTransaction(
-        txn_id=txn_id,
-        created_at=datetime.now(UTC),
-        source_path=str(resolved_path),
-        source_sha256=sha256_bytes(source_bytes),
-        source_snapshot_path=str(snapshot_path.resolve()),
-        fixes=[proposal.fix for proposal in fixes],
-        applied=False,
-    )
-    log_path = append_created_transaction(transaction)
-    try:
-        post_sha256 = apply_fixes_to_csv(path, [proposal.fix for proposal in fixes])
-        append_applied_event(log_path, txn_id, post_sha256=post_sha256)
-    except Exception:
-        path.write_bytes(source_bytes)
-        raise
-    return txn_id
 def repair(
     path: Annotated[
         Path,
         typer.Argument(
-            exists=True,
-            readable=True,
             help="Path to the CSV file to repair.",
         ),
     ],
@@ -355,8 +200,6 @@ def repair(
         Path | None,
         typer.Option(
             "--schema",
-            exists=True,
-            readable=True,
             help="Path to a YAML schema file with column types and FDs.",
         ),
     ] = None,
@@ -400,6 +243,10 @@ def repair(
         str,
         typer.Option("--llm-model", help="Model name for fd_violation LLM fallback."),
     ] = "gemini-2.0-flash",
 ) -> None:
     """Detect, propose, and optionally apply reversible repairs to a CSV."""
     if dry_run == apply:
@@ -410,58 +257,66 @@ def repair(
         raise typer.Exit(code=2)
     try:
         parsed_schema = _resolve_schema(schema)
-        df = read_csv(path)
     except Exception as exc:
         _print_error(str(exc))
         raise typer.Exit(code=2) from exc
-    issues = run_all_detectors(df, parsed_schema)
-    accepted_fixes, attempt_groups = _propose_repairs(
-        issues,
-        path,
-        df.copy(deep=True),
-        parsed_schema,
-        allow_llm=allow_llm,
-        model=llm_model,
-        allow_pii=allow_pii,
-        confirm_pii=confirm_pii,
-        confirm_escalations=confirm_escalations,
-        interactive=apply,
-    )
-    output_console = Console()
-    render_repair_diff(accepted_fixes, output_console, file_path=str(path))
-    failed_issue_count = _render_attempt_summary(attempt_groups, output_console)
-    if not accepted_fixes and failed_issue_count == 0:
         raise typer.Exit(code=1)
     if dry_run:
-        raise typer.Exit(code=0 if accepted_fixes else 1)
-    if not accepted_fixes:
-        raise typer.Exit(code=1)
-    batch_safety = SafetyFilter().evaluate_batch(accepted_fixes)
-    if batch_safety.verdict != SafetyVerdict.ALLOW:
-        _print_error(batch_safety.reason)
         raise typer.Exit(code=1)
-    source_bytes = path.read_bytes()
-    try:
-        txn_id = _apply_transaction(path, accepted_fixes, source_bytes)
-    except Exception as exc:
-        _print_error(
-            f"Failed to apply repairs: {exc}",
-            hint="The source file was restored to its pre-apply bytes.",
-        )
-        raise typer.Exit(code=1) from exc
     output_console.print(
         Panel(
-            f"[green]Applied {len(accepted_fixes)} fix(es).[/green]\n"
-            f"Transaction ID: [bold]{txn_id}[/bold]",
             title="Repair Applied",
             style="green",
         )

 from __future__ import annotations
+import json
 from pathlib import Path
+from typing import TYPE_CHECKING, Annotated
 import typer
 from rich.console import Console
 from rich.panel import Panel
+from dataforge.cli.common import load_schema, resolve_cli_path
 from dataforge.detectors.base import Issue, Schema
+from dataforge.repairers.base import ProposedFix, RepairAttempt
+from dataforge.safety import SafetyContext, SafetyFilter, SafetyResult
+from dataforge.transactions.txn import CellFix
 from dataforge.ui.repair_diff import render_repair_diff
+if TYPE_CHECKING:
+    import pandas as pd
+    from dataforge.engine.repair import RepairPipelineResult
 _console = Console(stderr=True)
     Raises:
         ValueError: If a fix references a missing row/column or stale old value.
     """
+    from dataforge.engine.repair import apply_fixes_to_csv as engine_apply_fixes_to_csv
+    return engine_apply_fixes_to_csv(path, fixes)
 def _resolve_schema(schema_path: Path | None) -> Schema | None:
     """Resolve an optional schema path into a parsed Schema."""
     if schema_path is None:
         return None
+    resolved_schema = resolve_cli_path(schema_path)
+    if not resolved_schema.exists():
+        raise typer.BadParameter(f"Schema file '{schema_path}' does not exist.")
+    return load_schema(resolved_schema)
 def _print_error(message: str, *, hint: str | None = None) -> None:
     confirm_escalations: bool,
     interactive: bool,
 ) -> tuple[list[ProposedFix], list[list[RepairAttempt]]]:
+    """Compatibility wrapper around the shared repair engine proposal stage."""
+    from dataforge.engine.repair import propose_repairs as engine_propose_repairs
+    return engine_propose_repairs(
+        issues,
+        path,
+        working_df,
+        schema,
         allow_llm=allow_llm,
         model=model,
         allow_pii=allow_pii,
         confirm_pii=confirm_pii,
         confirm_escalations=confirm_escalations,
+        interactive=interactive,
+        escalation_resolver=_resolve_escalation,
     )
     return len(failed_groups)
def _render_failure_summary(result: RepairPipelineResult, console: Console) -> int:
    """Print one line per issue the pipeline attempted but could not fix.

    Nothing is printed when ``result.failures`` is empty. Otherwise a heading
    is printed, followed by one folded line per failure. A failure whose
    unsat core carries an ``fd::`` label is tagged as a functional dependency
    rejection; failing that, a ``domain::`` label tags it as a domain bound
    rejection.

    Args:
        result: Pipeline result whose ``failures`` are reported.
        console: Console the summary is printed to.

    Returns:
        The number of failures reported (``0`` when there are none).
    """
    failures = result.failures
    if not failures:
        return 0

    console.print("[bold yellow]Attempted But Not Fixed[/bold yellow]")
    for item in failures:
        labels = item.unsat_core
        # "fd::" takes precedence when both kinds of label are present.
        if any(label.startswith("fd::") for label in labels):
            category = "functional dependency rejection - "
        elif any(label.startswith("domain::") for label in labels):
            category = "domain bound rejection - "
        else:
            category = ""
        location = f"{item.issue_type} at {item.row}:{item.column} "
        outcome = f"after {item.attempt_count} attempt(s): {category}{item.reason}"
        console.print(location + outcome, overflow="fold")
    return len(failures)
def _json_result(result: RepairPipelineResult) -> str:
    """Render a repair result as JSON text for CLI/MCP/CI consumers.

    Args:
        result: Pipeline result to serialize; it is dumped in ``"json"`` mode.

    Returns:
        The dump encoded with a two-space indent and keys sorted.
    """
    payload = result.model_dump(mode="json")
    return json.dumps(payload, sort_keys=True, indent=2)
 def _apply_transaction(
     path: Path,
     fixes: list[ProposedFix],
     source_bytes: bytes,
 ) -> str:
+    """Compatibility wrapper around the shared repair engine transaction path."""
+    from dataforge.engine.repair import apply_transaction as engine_apply_transaction
+    return engine_apply_transaction(path, fixes, source_bytes)
 def repair(
     path: Annotated[
         Path,
         typer.Argument(
             help="Path to the CSV file to repair.",
         ),
     ],
         Path | None,
         typer.Option(
             "--schema",
             help="Path to a YAML schema file with column types and FDs.",
         ),
     ] = None,
         str,
         typer.Option("--llm-model", help="Model name for fd_violation LLM fallback."),
     ] = "gemini-2.0-flash",
+    json_output: Annotated[
+        bool,
+        typer.Option("--json", help="Print repair result as JSON."),
+    ] = False,
 ) -> None:
     """Detect, propose, and optionally apply reversible repairs to a CSV."""
     if dry_run == apply:
         raise typer.Exit(code=2)
     try:
+        resolved_path = resolve_cli_path(path)
+        if not resolved_path.exists():
+            raise typer.BadParameter(f"CSV file '{path}' does not exist.")
         parsed_schema = _resolve_schema(schema)
     except Exception as exc:
         _print_error(str(exc))
         raise typer.Exit(code=2) from exc
+    try:
+        from dataforge.engine.repair import RepairPipelineRequest, run_repair_pipeline
+        result = run_repair_pipeline(
+            RepairPipelineRequest(
+                source_path=resolved_path,
+                mode="apply" if apply else "dry_run",
+                schema=parsed_schema,
+                allow_llm=allow_llm,
+                model=llm_model,
+                allow_pii=allow_pii,
+                confirm_pii=confirm_pii,
+                confirm_escalations=confirm_escalations,
+                interactive=apply,
+            )
+        )
+    except Exception as exc:
+        _print_error(
+            f"Failed to apply repairs: {exc}" if apply else f"Failed to repair: {exc}",
+            hint="The source file was restored to its pre-apply bytes." if apply else None,
+        )
+        raise typer.Exit(code=1 if apply else 2) from exc
+    if json_output:
+        typer.echo(_json_result(result))
+        raise typer.Exit(code=0 if result.fixes else 1)
+    output_console = Console()
+    render_repair_diff(result.fixes, output_console, file_path=str(resolved_path))
+    failed_issue_count = _render_failure_summary(result, output_console)
+    if not result.fixes and failed_issue_count == 0:
+        if result.receipt.reason != "No accepted fixes were produced.":
+            output_console.print(
+                Panel(
+                    f"[yellow]{result.receipt.reason}[/yellow]",
+                    title="Repair Summary",
+                    style="yellow",
+                )
+            )
         raise typer.Exit(code=1)
     if dry_run:
+        raise typer.Exit(code=0 if result.fixes else 1)
+    if not result.fixes or not result.receipt.applied:
         raise typer.Exit(code=1)
     output_console.print(
         Panel(
+            f"[green]Applied {len(result.fixes)} fix(es).[/green]\n"
+            f"Transaction ID: [bold]{result.receipt.txn_id}[/bold]",
             title="Repair Applied",
             style="green",
         )

dataforge/cli/watch.py ADDED Viewed

	@@ -0,0 +1,142 @@

+"""CLI subcommand: ``dataforge watch``."""
+from __future__ import annotations
+import json
+import time
+from pathlib import Path
+from typing import Annotated, Literal
+import typer
+from rich.console import Console
+from rich.panel import Panel
+from dataforge.cli.common import load_schema, read_csv, resolve_cli_path
+from dataforge.detectors import run_all_detectors
+from dataforge.detectors.base import Schema
+from dataforge.ui.profile_view import render_profile_table
+from dataforge.ui.repair_diff import render_repair_diff
+_console = Console(stderr=True)
+WatchAction = Literal["profile", "repair"]
def _load_optional_schema(schema_path: Path | None) -> Schema | None:
    """Load the schema file named by ``--schema``, if one was given.

    Args:
        schema_path: Path supplied on the command line, or ``None``.

    Returns:
        The schema returned by ``load_schema``, or ``None`` when no path was
        supplied.

    Raises:
        typer.BadParameter: If the resolved schema path does not exist.
    """
    if schema_path is None:
        return None
    schema_file = resolve_cli_path(schema_path)
    if schema_file.exists():
        return load_schema(schema_file)
    # Report the path exactly as the user typed it.
    raise typer.BadParameter(f"Schema file '{schema_path}' does not exist.")
def _profile_once(path: Path, schema: Schema | None, json_output: bool) -> None:
    """Run every detector over the file once and report the issues found.

    Args:
        path: CSV file to read and profile.
        schema: Optional schema passed through to the detectors.
        json_output: When true, echo a ``"profile"`` event as JSON; otherwise
            render the profile table to a fresh console.
    """
    found = run_all_detectors(read_csv(path), schema)
    if not json_output:
        render_profile_table(found, Console(), file_path=str(path))
        return

    event = {
        "event": "profile",
        "path": str(path),
        "issues_count": len(found),
        "issues": [entry.model_dump(mode="json") for entry in found],
    }
    typer.echo(json.dumps(event, indent=2, sort_keys=True))
def _repair_once(path: Path, schema: Schema | None, apply: bool, json_output: bool) -> None:
    """Run the shared repair pipeline once, non-interactively, and report it.

    Args:
        path: CSV file handed to the pipeline as its source.
        schema: Optional schema forwarded to the pipeline.
        apply: Selects pipeline mode ``"apply"`` when true, ``"dry_run"``
            otherwise.
        json_output: When true, echo the dumped result as JSON with an added
            ``"event": "repair"`` key; otherwise render the repair diff.
    """
    from dataforge.engine.repair import RepairPipelineRequest, run_repair_pipeline

    mode = "apply" if apply else "dry_run"
    request = RepairPipelineRequest(
        source_path=path,
        mode=mode,
        schema=schema,
        interactive=False,
    )
    outcome = run_repair_pipeline(request)

    if not json_output:
        render_repair_diff(outcome.fixes, Console(), file_path=str(path))
        return

    event = outcome.model_dump(mode="json")
    event["event"] = "repair"
    typer.echo(json.dumps(event, indent=2, sort_keys=True))
def _run_once(
    path: Path,
    schema: Schema | None,
    action: WatchAction,
    apply: bool,
    json_output: bool,
) -> None:
    """Run a single watch cycle for the selected action.

    The last parameter is named ``json_output`` rather than ``json`` so it
    does not shadow the module-level ``json`` import inside this function.

    Args:
        path: File to profile or repair.
        schema: Optional schema forwarded to the chosen handler.
        action: ``"repair"`` runs the repair handler; any other value runs
            the profile handler.
        apply: Forwarded to the repair handler only.
        json_output: Whether the handler should emit JSON instead of a
            rendered view.
    """
    if action == "repair":
        _repair_once(path, schema, apply, json_output)
    else:
        _profile_once(path, schema, json_output)
def watch(
    path: Annotated[
        Path,
        typer.Argument(help="CSV or dbt artifact path to watch."),
    ],
    schema: Annotated[
        Path | None,
        typer.Option("--schema", help="Path to a YAML schema file with column types and FDs."),
    ] = None,
    action: Annotated[
        WatchAction,
        typer.Option("--action", help="Action to run when the file changes: profile or repair."),
    ] = "profile",
    apply: Annotated[
        bool,
        typer.Option("--apply", help="Apply repairs on change. Defaults to dry-run repair."),
    ] = False,
    interval: Annotated[
        float,
        typer.Option("--interval", min=0.1, help="Polling interval in seconds."),
    ] = 2.0,
    once: Annotated[
        bool,
        typer.Option("--once", help="Run once and exit, useful for CI acceptance."),
    ] = False,
    json_output: Annotated[
        bool,
        typer.Option("--json", help="Print watch events as JSON."),
    ] = False,
) -> None:
    """Poll a path and rerun profile or repair when it changes."""
    # NOTE: the docstring above is kept to one line on purpose; Typer uses the
    # command docstring as its help text, so details live in comments.
    #
    # Exit codes: 2 when the path is missing, when --apply is combined with a
    # non-repair action, or when the watched path disappears while polling.

    def _read_mtime() -> int:
        # Single place that stats the watched file, so a file deleted at any
        # point (including during a run) is reported the same way.
        try:
            return resolved_path.stat().st_mtime_ns
        except FileNotFoundError:
            _console.print(f"[bold red]Watch path disappeared:[/bold red] {resolved_path}")
            raise typer.Exit(code=2) from None

    resolved_path = resolve_cli_path(path)
    if not resolved_path.exists():
        _console.print(f"[bold red]Watch path not found:[/bold red] {path}")
        raise typer.Exit(code=2)
    parsed_schema = _load_optional_schema(schema)
    if apply and action != "repair":
        _console.print(
            Panel(
                "--apply is only valid with --action repair.",
                title="Watch Error",
                style="red",
            )
        )
        raise typer.Exit(code=2)

    # Always run once up front, then take the baseline mtime afterwards.
    _run_once(resolved_path, parsed_schema, action, apply, json_output)
    if once:
        return

    last_mtime = _read_mtime()
    while True:
        time.sleep(interval)
        current_mtime = _read_mtime()
        if current_mtime == last_mtime:
            continue
        last_mtime = current_mtime
        _run_once(resolved_path, parsed_schema, action, apply, json_output)
        if apply:
            # An applied repair rewrites the watched file. Re-baseline after
            # the run so our own write is not mistaken for a new change,
            # which would trigger a redundant cycle on the next poll.
            # Trade-off: an external edit landing during the run is absorbed.
            last_mtime = _read_mtime()

dataforge/datasets/embedded/hospital/clean.csv ADDED Viewed

	@@ -0,0 +1,11 @@

+id,age,admission_date,name
+1,30,2020-01-01,Alice
+2,45,2020-01-02,Bob
+3,30,2020-01-03,Carol
+4,29,2020-01-04,Dave
+5,35,2020-01-05,Eve
+6,51,2020-01-06,Frank
+7,40,2020-01-07,Grace
+8,35,2020-01-08,Heidi
+9,28,2020-01-09,Ivan
+10,60,2020-01-10,Judy

dataforge/datasets/embedded/hospital/dirty.csv ADDED Viewed

	@@ -0,0 +1,11 @@

+id,age,admission_date,name
+1,30,2020-01-01,Alice
+2,45,2020-01-02,Bob
+3,N/A,2020-01-03,Carol
+4,29,2020-01-04,Dave
+5,null,2020-01-05,Eve
+6,51,2020-01-06,Frank
+7,40,2020-01-07,Grace
+8,35,2020-01-08,Heidi
+9,28,2020-01-09,Ivan
+10,60,2020-01-10,Judy

dataforge/datasets/real_world.py CHANGED Viewed

@@ -2,6 +2,8 @@
 from __future__ import annotations
 from dataclasses import dataclass
 from pathlib import Path
@@ -16,6 +18,9 @@ class DatasetDownloadError(RuntimeError):
     """Raised when a real-world dataset cannot be downloaded or loaded from cache."""
 class GroundTruthCell(BaseModel):
     """Single cell-level dirty-to-clean correction used for benchmark scoring."""
@@ -57,7 +62,11 @@ def _read_cached_csv(path: Path) -> pd.DataFrame:
 def _download_bytes(url: str) -> bytes:
     """Download raw CSV bytes from an upstream source URL."""
-    with httpx.Client(timeout=60.0, follow_redirects=True) as client:
         response = client.get(url)
         response.raise_for_status()
     return response.content
@@ -67,8 +76,19 @@ def _download_to_cache(metadata: DatasetMetadata, dataset_dir: Path) -> None:
     """Download dirty/clean CSV files into the dataset cache directory."""
     dataset_dir.mkdir(parents=True, exist_ok=True)
     dirty_url, clean_url = metadata.source_urls
     (dataset_dir / "dirty.csv").write_bytes(_download_bytes(dirty_url))
     (dataset_dir / "clean.csv").write_bytes(_download_bytes(clean_url))
 def _manual_download_message(metadata: DatasetMetadata, dataset_dir: Path, cause: Exception) -> str:
@@ -153,16 +173,26 @@ def load_real_world_dataset(
     dirty_path = dataset_dir / "dirty.csv"
     clean_path = dataset_dir / "clean.csv"
     if not dirty_path.exists() or not clean_path.exists():
         try:
             _download_to_cache(metadata, dataset_dir)
         except Exception as exc:  # pragma: no cover - exercised through tests via monkeypatch
-            raise DatasetDownloadError(
-                _manual_download_message(metadata, dataset_dir, exc)
-            ) from exc
-    dirty_df = _read_cached_csv(dirty_path)
-    clean_df = _read_cached_csv(clean_path)
     if len(dirty_df.index) != len(clean_df.index):
         raise ValueError(f"Dataset '{name}' dirty/clean row counts do not match.")

 from __future__ import annotations
+import logging
+import os
 from dataclasses import dataclass
 from pathlib import Path
     """Raised when a real-world dataset cannot be downloaded or loaded from cache."""
+_LOGGER = logging.getLogger("dataforge.datasets.real_world")
 class GroundTruthCell(BaseModel):
     """Single cell-level dirty-to-clean correction used for benchmark scoring."""
 def _download_bytes(url: str) -> bytes:
     """Download raw CSV bytes from an upstream source URL."""
+    try:
+        timeout = float(os.environ.get("DATAFORGE_DOWNLOAD_TIMEOUT_S", "5"))
+    except ValueError:
+        timeout = 5.0
+    with httpx.Client(timeout=timeout, follow_redirects=True) as client:
         response = client.get(url)
         response.raise_for_status()
     return response.content
     """Download dirty/clean CSV files into the dataset cache directory."""
     dataset_dir.mkdir(parents=True, exist_ok=True)
     dirty_url, clean_url = metadata.source_urls
+    _LOGGER.info("dataset_download_start name=%s dir=%s", metadata.name, dataset_dir)
     (dataset_dir / "dirty.csv").write_bytes(_download_bytes(dirty_url))
     (dataset_dir / "clean.csv").write_bytes(_download_bytes(clean_url))
+    _LOGGER.info("dataset_download_complete name=%s dir=%s", metadata.name, dataset_dir)
def _load_embedded_dataset(name: str) -> tuple[pd.DataFrame, pd.DataFrame] | None:
    """Load the dirty/clean CSV pair bundled under ``embedded/<name>``.

    The directory is looked up next to this module.

    Args:
        name: Dataset name, used as the sub-directory of ``embedded``.

    Returns:
        ``(dirty, clean)`` as read by ``_read_cached_csv``, or ``None`` when
        either ``dirty.csv`` or ``clean.csv`` is missing.
    """
    bundle_dir = Path(__file__).parent / "embedded" / name
    dirty_csv = bundle_dir / "dirty.csv"
    clean_csv = bundle_dir / "clean.csv"
    if dirty_csv.exists() and clean_csv.exists():
        return _read_cached_csv(dirty_csv), _read_cached_csv(clean_csv)
    return None
 def _manual_download_message(metadata: DatasetMetadata, dataset_dir: Path, cause: Exception) -> str:
     dirty_path = dataset_dir / "dirty.csv"
     clean_path = dataset_dir / "clean.csv"
+    dirty_df: pd.DataFrame | None = None
+    clean_df: pd.DataFrame | None = None
     if not dirty_path.exists() or not clean_path.exists():
+        _LOGGER.info("dataset_cache_miss name=%s dir=%s", name, dataset_dir)
         try:
             _download_to_cache(metadata, dataset_dir)
         except Exception as exc:  # pragma: no cover - exercised through tests via monkeypatch
+            fallback = _load_embedded_dataset(name)
+            if fallback is None:
+                raise DatasetDownloadError(
+                    _manual_download_message(metadata, dataset_dir, exc)
+                ) from exc
+            dirty_df, clean_df = fallback
+    else:
+        _LOGGER.info("dataset_cache_hit name=%s dir=%s", name, dataset_dir)
+    if dirty_df is None or clean_df is None:
+        dirty_df = _read_cached_csv(dirty_path)
+        clean_df = _read_cached_csv(clean_path)
     if len(dirty_df.index) != len(clean_df.index):
         raise ValueError(f"Dataset '{name}' dirty/clean row counts do not match.")

dataforge/detectors/__init__.py CHANGED Viewed

@@ -12,8 +12,6 @@ deduplicated, severity-sorted issue list.
 from __future__ import annotations
-import pandas as pd
 from dataforge.detectors.base import Detector, Issue, Schema, Severity
 from dataforge.detectors.decimal_shift import DecimalShiftDetector
 from dataforge.detectors.fd_violation import FDViolationDetector
@@ -33,14 +31,14 @@ __all__ = [
 _SEVERITY_ORDER = {Severity.UNSAFE: 0, Severity.REVIEW: 1, Severity.SAFE: 2}
-def run_all_detectors(df: pd.DataFrame, schema: Schema | None = None) -> list[Issue]:
     """Run all registered detectors and return a merged, sorted issue list.
     Issues are deduplicated by (row, column, issue_type) and sorted by
     severity (UNSAFE first) then confidence (highest first).
     Args:
-        df: The input DataFrame to analyze.
         schema: Optional declared schema with column types and constraints.
     Returns:

 from __future__ import annotations
 from dataforge.detectors.base import Detector, Issue, Schema, Severity
 from dataforge.detectors.decimal_shift import DecimalShiftDetector
 from dataforge.detectors.fd_violation import FDViolationDetector
 _SEVERITY_ORDER = {Severity.UNSAFE: 0, Severity.REVIEW: 1, Severity.SAFE: 2}
+def run_all_detectors(df: object, schema: Schema | None = None) -> list[Issue]:
     """Run all registered detectors and return a merged, sorted issue list.
     Issues are deduplicated by (row, column, issue_type) and sorted by
     severity (UNSAFE first) then confidence (highest first).
     Args:
+        df: The input table to analyze.
         schema: Optional declared schema with column types and constraints.
     Returns:

dataforge/detectors/base.py CHANGED Viewed

@@ -5,9 +5,9 @@ from __future__ import annotations
 import enum
 from typing import Literal, Protocol
-import pandas as pd
 from pydantic import BaseModel, Field
 from dataforge.verifier.schema import (
     AggregateDependency,
     DomainBound,
@@ -114,23 +114,23 @@ class Issue(BaseModel):
 class Detector(Protocol):
     """Structural protocol that every detector must implement.
-    A detector is a pure function over tabular data: it receives a DataFrame
     and an optional Schema, and returns a list of Issue objects. No LLM calls,
     no disk I/O, no side effects.
     Example:
         >>> class MyDetector:
         ...     def detect(
-        ...         self, df: pd.DataFrame, schema: Schema | None = None
         ...     ) -> list[Issue]:
         ...         return []
     """
-    def detect(self, df: pd.DataFrame, schema: Schema | None = None) -> list[Issue]:
         """Detect data-quality issues in the given DataFrame.
         Args:
-            df: The input DataFrame to analyze.
             schema: Optional declared schema with column types and constraints.
         Returns:

 import enum
 from typing import Literal, Protocol
 from pydantic import BaseModel, Field
+from dataforge.table import TableLike
 from dataforge.verifier.schema import (
     AggregateDependency,
     DomainBound,
 class Detector(Protocol):
     """Structural protocol that every detector must implement.
+    A detector is a pure function over tabular data: it receives a table
     and an optional Schema, and returns a list of Issue objects. No LLM calls,
     no disk I/O, no side effects.
     Example:
         >>> class MyDetector:
         ...     def detect(
+        ...         self, df: TableLike, schema: Schema | None = None
         ...     ) -> list[Issue]:
         ...         return []
     """
+    def detect(self, df: TableLike, schema: Schema | None = None) -> list[Issue]:
         """Detect data-quality issues in the given DataFrame.
         Args:
+            df: The input table to analyze.
             schema: Optional declared schema with column types and constraints.
         Returns:

dataforge/detectors/decimal_shift.py CHANGED Viewed

@@ -10,15 +10,10 @@ The detector is **pure**: no LLM calls, no I/O, no side effects.
 from __future__ import annotations
 import math
-from typing import TYPE_CHECKING
-import numpy as np
-import pandas as pd
 from dataforge.detectors.base import Issue, Schema, Severity
-if TYPE_CHECKING:
-    pass
 # Minimum non-null numeric values required for meaningful statistics.
 _MIN_COLUMN_SIZE = 5
@@ -70,7 +65,7 @@ class DecimalShiftDetector:
         3
     """
-    def detect(self, df: pd.DataFrame, schema: Schema | None = None) -> list[Issue]:
         """Detect decimal-shift issues in the DataFrame.
         Args:
@@ -83,13 +78,13 @@ class DecimalShiftDetector:
         """
         issues: list[Issue] = []
-        for col_name in df.columns:
             col_issues = self._check_column(df, str(col_name))
             issues.extend(col_issues)
         return issues
-    def _check_column(self, df: pd.DataFrame, col_name: str) -> list[Issue]:
         """Check a single column for decimal-shift outliers.
         Args:
@@ -101,7 +96,7 @@ class DecimalShiftDetector:
         """
         # Parse all values to float, keeping track of original indices.
         parsed: list[tuple[int, float, str]] = []
-        for row_idx, val in enumerate(df[col_name].tolist()):
             fval = _try_float(val)
             if fval is not None:
                 parsed.append((row_idx, fval, str(val)))
@@ -109,11 +104,10 @@ class DecimalShiftDetector:
         if len(parsed) < _MIN_COLUMN_SIZE:
             return []
-        values = np.array([v for _, v, _ in parsed])
-        median = float(np.median(values))
         # If median is zero or very close, we cannot compute meaningful ratios.
-        if abs(median) < 1e-10:
             return []
         issues: list[Issue] = []
@@ -121,7 +115,7 @@ class DecimalShiftDetector:
             if abs(fval) < 1e-10:
                 continue
-            ratio = fval / median
             if abs(ratio) < 1e-10:
                 continue
@@ -147,13 +141,13 @@ class DecimalShiftDetector:
                     reason = (
                         f"Value {fval:g} in column '{col_name}' appears to be "
                         f"~{int(correction_factor)}x the typical value "
-                        f"(median ~{median:g})"
                     )
                 else:
                     reason = (
                         f"Value {fval:g} in column '{col_name}' appears to be "
                         f"~{1.0 / correction_factor:g}x too small compared to "
-                        f"the typical value (median ~{median:g})"
                     )
                 issues.append(

 from __future__ import annotations
 import math
+from statistics import median
 from dataforge.detectors.base import Issue, Schema, Severity
+from dataforge.table import TableLike, column_names, column_values
 # Minimum non-null numeric values required for meaningful statistics.
 _MIN_COLUMN_SIZE = 5
         3
     """
+    def detect(self, df: TableLike, schema: Schema | None = None) -> list[Issue]:
         """Detect decimal-shift issues in the DataFrame.
         Args:
         """
         issues: list[Issue] = []
+        for col_name in column_names(df):
             col_issues = self._check_column(df, str(col_name))
             issues.extend(col_issues)
         return issues
+    def _check_column(self, df: TableLike, col_name: str) -> list[Issue]:
         """Check a single column for decimal-shift outliers.
         Args:
         """
         # Parse all values to float, keeping track of original indices.
         parsed: list[tuple[int, float, str]] = []
+        for row_idx, val in enumerate(column_values(df, col_name)):
             fval = _try_float(val)
             if fval is not None:
                 parsed.append((row_idx, fval, str(val)))
         if len(parsed) < _MIN_COLUMN_SIZE:
             return []
+        center = float(median([v for _, v, _ in parsed]))
         # If median is zero or very close, we cannot compute meaningful ratios.
+        if abs(center) < 1e-10:
             return []
         issues: list[Issue] = []
             if abs(fval) < 1e-10:
                 continue
+            ratio = fval / center
             if abs(ratio) < 1e-10:
                 continue
                     reason = (
                         f"Value {fval:g} in column '{col_name}' appears to be "
                         f"~{int(correction_factor)}x the typical value "
+                        f"(median ~{center:g})"
                     )
                 else:
                     reason = (
                         f"Value {fval:g} in column '{col_name}' appears to be "
                         f"~{1.0 / correction_factor:g}x too small compared to "
+                        f"the typical value (median ~{center:g})"
                     )
                 issues.append(

dataforge/detectors/fd_violation.py CHANGED Viewed

@@ -12,14 +12,8 @@ The detector is **pure**: no LLM calls, no I/O, no side effects.
 from __future__ import annotations
-from typing import TYPE_CHECKING
-import pandas as pd
 from dataforge.detectors.base import Issue, Schema, Severity
-if TYPE_CHECKING:
-    pass
 class FDViolationDetector:
@@ -49,7 +43,7 @@ class FDViolationDetector:
         2
     """
-    def detect(self, df: pd.DataFrame, schema: Schema | None = None) -> list[Issue]:
         """Detect FD-violation issues in the DataFrame.
         Args:
@@ -73,7 +67,7 @@ class FDViolationDetector:
     def _check_fd(
         self,
-        df: pd.DataFrame,
         determinant: tuple[str, ...],
         dependent: str,
     ) -> list[Issue]:
@@ -91,34 +85,37 @@ class FDViolationDetector:
         # Verify all columns exist in the DataFrame.
         all_cols = [*determinant_columns, dependent]
         for col in all_cols:
-            if col not in df.columns:
                 return []
-        # Drop rows with null values in determinant columns.
-        subset = df[all_cols].copy()
-        mask = subset[determinant_columns].notna().all(axis=1)
-        subset = subset[mask]
-        if subset.empty:
             return []
-        # Group by determinant and find groups with multiple distinct
-        # dependent values.
         issues: list[Issue] = []
-        grouped = subset.groupby(determinant_columns, sort=False)
-        for group_key, group_df in grouped:
-            unique_deps = group_df[dependent].dropna().unique()
             if len(unique_deps) <= 1:
                 continue
-            # All rows in this group are part of the violation.
             det_desc = self._format_determinant(determinant, group_key)
             unique_str = ", ".join(repr(str(v)) for v in unique_deps)
-            for idx in group_df.index:
-                actual_val = str(group_df.at[idx, dependent])
                 reason = (
                     f"Functional dependency {determinant} -> {dependent} "
                     f"violated: {det_desc} maps to multiple values: "

 from __future__ import annotations
 from dataforge.detectors.base import Issue, Schema, Severity
+from dataforge.table import TableLike, cell_value, column_names, row_count
 class FDViolationDetector:
         2
     """
+    def detect(self, df: TableLike, schema: Schema | None = None) -> list[Issue]:
         """Detect FD-violation issues in the DataFrame.
         Args:
     def _check_fd(
         self,
+        df: TableLike,
         determinant: tuple[str, ...],
         dependent: str,
     ) -> list[Issue]:
         # Verify all columns exist in the DataFrame.
         all_cols = [*determinant_columns, dependent]
+        available_columns = set(column_names(df))
         for col in all_cols:
+            if col not in available_columns:
                 return []
+        groups: dict[tuple[str, ...], list[int]] = {}
+        for row in range(row_count(df)):
+            group_key = tuple(cell_value(df, row, column) for column in determinant_columns)
+            if any(value == "" for value in group_key):
+                continue
+            groups.setdefault(group_key, []).append(row)
+        if not groups:
             return []
         issues: list[Issue] = []
+        for group_key, row_indices in groups.items():
+            unique_deps: list[str] = []
+            for row in row_indices:
+                value = cell_value(df, row, dependent)
+                if value == "" or value in unique_deps:
+                    continue
+                unique_deps.append(value)
             if len(unique_deps) <= 1:
                 continue
             det_desc = self._format_determinant(determinant, group_key)
             unique_str = ", ".join(repr(str(v)) for v in unique_deps)
+            for idx in row_indices:
+                actual_val = cell_value(df, idx, dependent)
                 reason = (
                     f"Functional dependency {determinant} -> {dependent} "
                     f"violated: {det_desc} maps to multiple values: "

dataforge/detectors/type_mismatch.py CHANGED Viewed

@@ -10,14 +10,9 @@ The detector is **pure**: no LLM calls, no I/O, no side effects.
 from __future__ import annotations
 import re
-from typing import TYPE_CHECKING
-import pandas as pd
 from dataforge.detectors.base import Issue, Schema, Severity
-if TYPE_CHECKING:
-    pass
 # Compiled regexes for type inference.
 _NUMERIC_RE = re.compile(r"^[+-]?(\d+\.?\d*|\.\d+)([eE][+-]?\d+)?$")
@@ -69,7 +64,7 @@ class TypeMismatchDetector:
         'N/A'
     """
-    def detect(self, df: pd.DataFrame, schema: Schema | None = None) -> list[Issue]:
         """Detect type-mismatch issues in the DataFrame.
         Args:
@@ -84,13 +79,13 @@ class TypeMismatchDetector:
         """
         issues: list[Issue] = []
-        for col_name in df.columns:
             col_issues = self._check_column(df, str(col_name))
             issues.extend(col_issues)
         return issues
-    def _check_column(self, df: pd.DataFrame, col_name: str) -> list[Issue]:
         """Check a single column for type mismatches.
         Args:
@@ -100,12 +95,10 @@ class TypeMismatchDetector:
         Returns:
             Issues found in this column.
         """
-        series = df[col_name]
         # Collect (index, value, type) for non-null entries.
         classified: list[tuple[int, str, str]] = []
-        for row_idx, val in enumerate(series.tolist()):
-            if pd.isna(val):
                 continue
             str_val = str(val).strip()
             if not str_val:

 from __future__ import annotations
 import re
 from dataforge.detectors.base import Issue, Schema, Severity
+from dataforge.table import TableLike, column_names, column_values
 # Compiled regexes for type inference.
 _NUMERIC_RE = re.compile(r"^[+-]?(\d+\.?\d*|\.\d+)([eE][+-]?\d+)?$")
         'N/A'
     """
+    def detect(self, df: TableLike, schema: Schema | None = None) -> list[Issue]:
         """Detect type-mismatch issues in the DataFrame.
         Args:
         """
         issues: list[Issue] = []
+        for col_name in column_names(df):
             col_issues = self._check_column(df, str(col_name))
             issues.extend(col_issues)
         return issues
+    def _check_column(self, df: TableLike, col_name: str) -> list[Issue]:
         """Check a single column for type mismatches.
         Args:
         Returns:
             Issues found in this column.
         """
         # Collect (index, value, type) for non-null entries.
         classified: list[tuple[int, str, str]] = []
+        for row_idx, val in enumerate(column_values(df, col_name)):
+            if val is None:
                 continue
             str_val = str(val).strip()
             if not str_val:

dataforge/engine/__init__.py CHANGED Viewed

	@@ -1 +1,33 @@
1	- """~~Engine~~ ~~package~~ ~~scaffolding~~ for DataForge."""

+"""Public backend engine APIs for DataForge."""
+from dataforge.engine.repair import (
+    CandidateFix,
+    RepairFailure,
+    RepairMode,
+    RepairPipelineRequest,
+    RepairPipelineResult,
+    RepairReceipt,
+    VerifiedFix,
+    apply_fixes_to_csv,
+    apply_transaction,
+    create_repair_transaction,
+    propose_repairs,
+    run_repair_pipeline,
+    source_path_lock,
+)
+__all__ = [
+    "CandidateFix",
+    "RepairFailure",
+    "RepairMode",
+    "RepairPipelineRequest",
+    "RepairPipelineResult",
+    "RepairReceipt",
+    "VerifiedFix",
+    "apply_fixes_to_csv",
+    "apply_transaction",
+    "create_repair_transaction",
+    "propose_repairs",
+    "run_repair_pipeline",
+    "source_path_lock",
+]

dataforge/engine/repair.py ADDED Viewed

	@@ -0,0 +1,670 @@

+"""Public repair engine for DataForge backend surfaces.
+The engine is the stable boundary shared by CLI, Playground, MCP, and any
+OpenEnv adapter that needs repair semantics. It keeps the core invariant in one
+place: detect -> propose -> safety -> SMT verification -> journal/snapshot ->
+atomic mutation -> byte-identical revert.
+"""
+from __future__ import annotations
+import hashlib
+import os
+import secrets
+import time
+from collections.abc import Callable, Iterator
+from contextlib import contextmanager, suppress
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Literal
+from pydantic import BaseModel, ConfigDict, Field
+from dataforge.detectors import run_all_detectors
+from dataforge.detectors.base import Issue, Schema
+from dataforge.observability import repair_stage_span
+from dataforge.repair_contract import CONTRACT_VERSION
+from dataforge.repairers import build_repairers
+from dataforge.repairers.base import ProposedFix, RepairAttempt, RetryContext
+from dataforge.safety import SafetyContext, SafetyFilter, SafetyResult, SafetyVerdict
+from dataforge.table import (
+    Table,
+    TableLike,
+    cell_value,
+    column_names,
+    copy_table,
+    row_count,
+    set_cell_value,
+    table_to_csv_bytes,
+)
+from dataforge.table import (
+    read_csv as read_table_csv,
+)
+from dataforge.transactions.log import (
+    append_applied_event,
+    append_created_transaction,
+    cache_dir_for,
+    sha256_bytes,
+    sha256_file,
+    snapshot_path_for,
+)
+from dataforge.transactions.txn import CellFix, RepairTransaction, generate_txn_id
+from dataforge.verifier import SMTVerifier, VerificationVerdict
+RepairMode = Literal["dry_run", "apply"]
+EscalationResolver = Callable[
+    [ProposedFix, Schema | None, SafetyContext, SafetyFilter, SafetyResult],
+    tuple[SafetyContext, SafetyResult],
+]
+class RepairEngineError(RuntimeError):
+    """Base exception for public repair engine failures.
+    Root of this module's own exception hierarchy; ``TransactionApplyError``
+    derives from it.
+    """
+class TransactionApplyError(RepairEngineError):
+    """Raised when an apply transaction cannot be completed safely.
+    In this module it is raised when the source lock times out, when a
+    snapshot for the transaction id already exists, when the source file
+    changed after detection, and when a failed apply could not be rolled back
+    to the original bytes.
+    """
+class CandidateFix(BaseModel):
+    """Public description of one proposed cell repair.
+    The model is strict, rejects unknown fields, and is frozen (see
+    ``model_config``).
+    """
+    # ``row`` must be non-negative; fields declared with ``min_length=1`` must
+    # be non-empty, and ``confidence`` is limited to the range 0.0..1.0.
+    row: int = Field(ge=0)
+    column: str = Field(min_length=1)
+    old_value: str
+    new_value: str
+    detector_id: str = Field(min_length=1)
+    operation: Literal["update", "delete_row"] = "update"
+    reason: str = Field(min_length=1)
+    confidence: float = Field(ge=0.0, le=1.0)
+    provenance: str = Field(min_length=1)
+    model_config = ConfigDict(strict=True, extra="forbid", frozen=True)
+    @classmethod
+    def from_proposed(cls, proposed_fix: ProposedFix) -> CandidateFix:
+        """Build the public candidate for an internal ``ProposedFix``.
+        Cell-level fields are copied from ``proposed_fix.fix``; ``reason``,
+        ``confidence`` and ``provenance`` come from the proposal itself.
+        """
+        cell = proposed_fix.fix
+        payload = {
+            "row": cell.row,
+            "column": cell.column,
+            "old_value": cell.old_value,
+            "new_value": cell.new_value,
+            "detector_id": cell.detector_id,
+            "operation": cell.operation,
+            "reason": proposed_fix.reason,
+            "confidence": proposed_fix.confidence,
+            "provenance": proposed_fix.provenance,
+        }
+        return cls(**payload)
+class VerifiedFix(CandidateFix):
+    """A candidate that passed safety and SMT verification.
+    Extends ``CandidateFix`` with the reason recorded for the accepted attempt.
+    """
+    # ``_verified_fixes`` fills this from the accepted attempt and falls back
+    # to "Accepted by verifier." when no accepted attempt matches the fix.
+    verifier_reason: str = Field(min_length=1)
+class RepairFailure(BaseModel):
+    """Machine-readable record of an issue the pipeline could not repair."""
+    row: int = Field(ge=0)
+    column: str = Field(min_length=1)
+    issue_type: str = Field(min_length=1)
+    status: str = Field(min_length=1)
+    reason: str = Field(min_length=1)
+    attempt_count: int = Field(ge=1)
+    unsat_core: tuple[str, ...] = Field(default_factory=tuple)
+    model_config = ConfigDict(strict=True, extra="forbid", frozen=True)
+    @classmethod
+    def from_attempts(cls, attempts: list[RepairAttempt]) -> RepairFailure:
+        """Summarise one issue's attempt trace as a public failure record.
+        The final attempt supplies the issue location, status, reason and
+        unsat core; ``attempt_count`` is the length of the trace. The trace
+        must be non-empty because its last element is read unconditionally.
+        """
+        last_attempt = attempts[-1]
+        failed_issue = last_attempt.issue
+        core = tuple(last_attempt.unsat_core)
+        return cls(
+            row=failed_issue.row,
+            column=failed_issue.column,
+            issue_type=failed_issue.issue_type,
+            status=last_attempt.status,
+            reason=last_attempt.reason,
+            attempt_count=len(attempts),
+            unsat_core=core,
+        )
+class RepairReceipt(BaseModel):
+    """Stable receipt for a dry-run or applied repair pipeline run.
+    ``run_repair_pipeline`` builds one receipt per run.
+    """
+    contract_version: str = CONTRACT_VERSION
+    mode: RepairMode
+    applied: bool  # set to True by run_repair_pipeline only after apply_transaction returned
+    reversible: bool  # run_repair_pipeline currently always passes True
+    source_path: str
+    # Both digests must be 64 lowercase hexadecimal characters.
+    source_sha256: str = Field(pattern=r"^[0-9a-f]{64}$")
+    post_sha256: str | None = Field(default=None, pattern=r"^[0-9a-f]{64}$")
+    txn_id: str | None = None
+    # run_repair_pipeline fills these with every column name and every row
+    # index of the table as it was read before repair.
+    allowed_columns: list[str] = Field(default_factory=list)
+    valid_rows: list[int] = Field(default_factory=list)
+    issues_count: int = Field(ge=0)
+    fixes_count: int = Field(ge=0)
+    reason: str = Field(min_length=1)
+    model_config = ConfigDict(strict=True, extra="forbid", frozen=True)
+class RepairPipelineRequest(BaseModel):
+    """Input contract for running the public repair pipeline.
+    ``run_repair_pipeline`` resolves ``source_path``, forwards the LLM, PII,
+    escalation and ``interactive`` flags to ``propose_repairs``, and uses
+    ``mode`` together with ``create_dry_run_transaction`` to decide whether
+    anything is journaled or written.
+    """
+    source_path: Path
+    mode: RepairMode = "dry_run"
+    # Exposed under the alias "schema"; with populate_by_name below the field
+    # name itself is presumably accepted too - confirm against pydantic docs.
+    repair_schema: Schema | None = Field(default=None, alias="schema")
+    allow_llm: bool = False
+    model: str = "gemini-2.0-flash"
+    allow_pii: bool = False
+    confirm_pii: bool = False
+    confirm_escalations: bool = False
+    # NOTE(review): run_repair_pipeline passes no escalation resolver, so this
+    # flag alone does not trigger interactive resolution there.
+    interactive: bool = False
+    create_dry_run_transaction: bool = False
+    model_config = ConfigDict(
+        strict=True,
+        arbitrary_types_allowed=True,
+        extra="forbid",
+        frozen=True,
+        populate_by_name=True,
+    )
+class RepairPipelineResult(BaseModel):
+    """Output contract for a public repair pipeline run.
+    Bundles the receipt with the detected issues, the fixes that were accepted,
+    and one failure record per issue whose final attempt was not accepted.
+    """
+    receipt: RepairReceipt
+    issues: list[Issue]
+    fixes: list[VerifiedFix]
+    failures: list[RepairFailure] = Field(default_factory=list)
+    # run_repair_pipeline sets this only when it journals a dry-run
+    # transaction; after an apply it stays None and only receipt.txn_id is set.
+    transaction: RepairTransaction | None = None
+    model_config = ConfigDict(
+        strict=True, arbitrary_types_allowed=True, extra="forbid", frozen=True
+    )
+def _atomic_write_bytes(path: Path, payload: bytes) -> None:
+    """Write bytes to ``path`` through an atomic same-directory replacement.
+    The payload is written to a randomly named hidden temp file next to the
+    target, flushed and fsynced, and then moved over the target with
+    ``os.replace``.
+    Args:
+        path: Destination file; missing parent directories are created.
+        payload: Exact bytes the destination should contain afterwards.
+    """
+    resolved = path.resolve()
+    resolved.parent.mkdir(parents=True, exist_ok=True)
+    temp_path = resolved.with_name(f".{resolved.name}.{secrets.token_hex(8)}.tmp")
+    try:
+        # "xb" refuses to reuse an existing temp name instead of truncating it.
+        with temp_path.open("xb") as handle:
+            handle.write(payload)
+            handle.flush()
+            os.fsync(handle.fileno())
+        os.replace(temp_path, resolved)
+    finally:
+        # After a successful replace the temp name no longer exists; on any
+        # failure remove the leftover. missing_ok avoids the exists()/unlink()
+        # check-then-act race and matches the snapshot cleanup in this module.
+        temp_path.unlink(missing_ok=True)
+def read_csv(path: Path) -> Table:
+    """Read a CSV using conservative string-preserving defaults.
+    Thin wrapper that delegates to ``dataforge.table.read_csv`` (imported here
+    as ``read_table_csv``).
+    """
+    return read_table_csv(path)
+def _csv_bytes_after_fixes(path: Path, fixes: list[CellFix]) -> bytes:
+    """Validate fixes against a CSV and return the mutated CSV bytes.
+    Fixes are applied in order to an in-memory table; the file on disk is not
+    modified.
+    Args:
+        path: CSV file the fixes refer to.
+        fixes: Ordered cell updates; each must match the current cell value.
+    Returns:
+        The serialized table after every fix has been applied.
+    Raises:
+        ValueError: For a non-``update`` operation, an unknown column, an
+            out-of-range row, or an ``old_value`` that no longer matches.
+    """
+    df = read_csv(path)
+    # The table shape is looked up once instead of per fix; the only mutation
+    # below is an in-place cell update (assumes set_cell_value never adds rows
+    # or columns).
+    known_columns = set(column_names(df))
+    total_rows = row_count(df)
+    for fix in fixes:
+        if fix.operation != "update":
+            raise ValueError(f"Unsupported repair operation '{fix.operation}' for row {fix.row}.")
+        if fix.column not in known_columns:
+            raise ValueError(f"Column '{fix.column}' not found in '{path}'.")
+        if not 0 <= fix.row < total_rows:
+            raise ValueError(f"Row {fix.row} is out of bounds for '{path}'.")
+        current_value = cell_value(df, fix.row, fix.column)
+        # A mismatch means the fix was computed against different data.
+        if current_value != fix.old_value:
+            raise ValueError(
+                f"Refusing to apply stale fix for row {fix.row}, column '{fix.column}': "
+                f"expected '{fix.old_value}', found '{current_value}'."
+            )
+        set_cell_value(df, fix.row, fix.column, fix.new_value)
+    return table_to_csv_bytes(df)
+def apply_fixes_to_csv(path: Path, fixes: list[CellFix]) -> str:
+    """Apply ordered cell fixes to a CSV atomically and return the new SHA-256.
+    The mutated bytes are produced in memory first, so a validation error
+    leaves the file untouched; the returned hex digest is computed from the
+    bytes that are written.
+    """
+    mutated = _csv_bytes_after_fixes(path, fixes)
+    digest = hashlib.sha256(mutated).hexdigest()
+    _atomic_write_bytes(path, mutated)
+    return digest
+def _lock_path_for(source_path: Path) -> Path:
+    """Return the lock-file path that guards ``source_path``.
+    The lock lives in ``.dataforge/locks`` beside the resolved source file and
+    is named after the first 24 hex characters of the SHA-256 of the resolved
+    path string.
+    """
+    resolved = source_path.resolve()
+    full_digest = hashlib.sha256(str(resolved).encode("utf-8")).hexdigest()
+    lock_name = f"{full_digest[:24]}.lock"
+    return resolved.parent.joinpath(".dataforge", "locks", lock_name)
+@contextmanager
+def source_path_lock(
+    source_path: Path,
+    *,
+    timeout_seconds: float = 5.0,
+    stale_after_seconds: float = 300.0,
+) -> Iterator[None]:
+    """Acquire an exclusive lock for a source path using an atomic lock file.
+    The lock file is created with ``O_CREAT | O_EXCL`` and receives the current
+    process id plus a UTC timestamp. While the file already exists the call
+    polls every 0.05 seconds until the deadline passes.
+    Args:
+        source_path: File whose lock path is derived via ``_lock_path_for``.
+        timeout_seconds: How long to keep retrying before giving up.
+        stale_after_seconds: Lock files whose mtime is older than this are
+            deleted and acquisition is retried immediately.
+    Yields:
+        None, while the lock file is held.
+    Raises:
+        TransactionApplyError: If the lock could not be acquired in time.
+    """
+    lock_path = _lock_path_for(source_path)
+    lock_path.parent.mkdir(parents=True, exist_ok=True)
+    deadline = time.monotonic() + timeout_seconds
+    while True:
+        try:
+            # Exclusive create: fails with FileExistsError if the file exists.
+            fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
+            try:
+                payload = f"{os.getpid()} {datetime.now(UTC).isoformat()}\n".encode()
+                os.write(fd, payload)
+            finally:
+                os.close(fd)
+            break
+        except FileExistsError as exc:
+            try:
+                age = time.time() - lock_path.stat().st_mtime
+            except OSError:
+                # The lock could not be inspected; treat it as fresh.
+                age = 0.0
+            if age > stale_after_seconds:
+                # NOTE(review): the stat and the unlink are separate steps, so
+                # another waiter may have replaced the lock in between.
+                try:
+                    lock_path.unlink()
+                    continue
+                except OSError:
+                    pass
+            if time.monotonic() >= deadline:
+                raise TransactionApplyError(
+                    f"Timed out waiting for DataForge source lock: {source_path.resolve()}"
+                ) from exc
+            time.sleep(0.05)
+    try:
+        yield
+    finally:
+        # Release the lock; a lock file that is already gone is not an error.
+        with suppress(FileNotFoundError):
+            lock_path.unlink()
+def _write_snapshot_once(snapshot_path: Path, source_bytes: bytes) -> None:
+    """Persist the source bytes as a snapshot that must not already exist.
+    Args:
+        snapshot_path: Target file; missing parent directories are created.
+        source_bytes: Bytes to store, flushed and fsynced before returning.
+    Raises:
+        TransactionApplyError: If a file already exists at ``snapshot_path``.
+    """
+    snapshot_path.parent.mkdir(parents=True, exist_ok=True)
+    try:
+        # Exclusive-create mode is what turns a reused transaction id into an error.
+        handle = snapshot_path.open("xb")
+    except FileExistsError as exc:
+        raise TransactionApplyError(
+            f"Transaction snapshot already exists: {snapshot_path}"
+        ) from exc
+    with handle:
+        handle.write(source_bytes)
+        handle.flush()
+        os.fsync(handle.fileno())
+def create_repair_transaction(
+    path: Path,
+    fixes: list[ProposedFix],
+    source_bytes: bytes,
+    *,
+    txn_id: str | None = None,
+) -> tuple[RepairTransaction, Path]:
+    """Journal an unapplied transaction and write its source snapshot.
+    Args:
+        path: Source file the transaction refers to.
+        fixes: Accepted proposals; only their cell fixes are journaled.
+        source_bytes: Original file content, stored verbatim as the snapshot.
+        txn_id: Optional explicit id; a fresh one is generated when falsy.
+    Returns:
+        The created transaction and the path returned by the journal append.
+    """
+    source_file = path.resolve()
+    chosen_id = txn_id or generate_txn_id()
+    snapshot_file = snapshot_path_for(source_file, chosen_id)
+    _write_snapshot_once(snapshot_file, source_bytes)
+    record = RepairTransaction(
+        txn_id=chosen_id,
+        created_at=datetime.now(UTC),
+        source_path=str(source_file),
+        source_sha256=sha256_bytes(source_bytes),
+        source_snapshot_path=str(snapshot_file.resolve()),
+        fixes=[proposal.fix for proposal in fixes],
+        applied=False,
+    )
+    try:
+        journal_path = append_created_transaction(record)
+    except Exception:
+        # Without a journal entry the snapshot would be orphaned; remove it.
+        snapshot_file.unlink(missing_ok=True)
+        raise
+    else:
+        return record, journal_path
+def apply_transaction(
+    path: Path,
+    fixes: list[ProposedFix],
+    source_bytes: bytes,
+    *,
+    txn_id: str | None = None,
+) -> str:
+    """Journal, snapshot, atomically apply fixes, and restore bytes on failure.
+    Everything runs while holding ``source_path_lock`` for the resolved path.
+    Args:
+        path: Source CSV to mutate.
+        fixes: Accepted proposals whose cell fixes are applied in order.
+        source_bytes: File content the fixes were computed against.
+        txn_id: Optional explicit transaction id.
+    Returns:
+        The id of the applied transaction.
+    Raises:
+        TransactionApplyError: If the file no longer matches ``source_bytes``
+            or if the original bytes could not be restored after a failed
+            apply. Any other apply failure is re-raised after the restore.
+    """
+    resolved_path = path.resolve()
+    with source_path_lock(resolved_path):
+        # Refuse to work on a file that changed since it was analysed.
+        current_bytes = resolved_path.read_bytes()
+        if current_bytes != source_bytes:
+            raise TransactionApplyError(
+                "Refusing to apply repairs because the source file changed after detection."
+            )
+        # Journal entry and snapshot are created before the file is mutated.
+        with repair_stage_span("dataforge.repair.transaction.create", fixes_count=len(fixes)):
+            transaction, log_path = create_repair_transaction(
+                resolved_path,
+                fixes,
+                source_bytes,
+                txn_id=txn_id,
+            )
+        try:
+            with repair_stage_span("dataforge.repair.transaction.apply", fixes_count=len(fixes)):
+                post_sha256 = apply_fixes_to_csv(
+                    resolved_path,
+                    [proposal.fix for proposal in fixes],
+                )
+                append_applied_event(log_path, transaction.txn_id, post_sha256=post_sha256)
+        except Exception as exc:
+            # Roll back by rewriting the original bytes, then verify the hash.
+            # NOTE(review): if this restore write itself raises, that error
+            # propagates instead of the one handled here.
+            _atomic_write_bytes(resolved_path, source_bytes)
+            if sha256_file(resolved_path) != transaction.source_sha256:
+                raise TransactionApplyError(
+                    "Apply failed and the source file could not be restored to original bytes."
+                ) from exc
+            raise
+    return transaction.txn_id
+def _build_retry_context(issue: Issue, attempts: list[RepairAttempt]) -> RetryContext:
+    """Derive retry hints for ``issue`` from its earlier attempts.
+    Values proposed by denied, rejected or unknown attempts are collected as
+    rejected values; every attempt contributes its reason followed by its
+    unsat-core entries to the hints.
+    """
+    retryable_statuses = {"denied", "rejected", "unknown"}
+    tried_values = frozenset(
+        earlier.fix.fix.new_value
+        for earlier in attempts
+        if earlier.fix is not None and earlier.status in retryable_statuses
+    )
+    collected_hints = tuple(
+        text for earlier in attempts for text in (earlier.reason, *earlier.unsat_core)
+    )
+    return RetryContext(
+        issue=issue,
+        previous_attempts=tuple(attempts),
+        rejected_values=tried_values,
+        hints=collected_hints,
+    )
+def propose_repairs(
+    issues: list[Issue],
+    path: Path,
+    working_df: TableLike,
+    schema: Schema | None,
+    *,
+    allow_llm: bool,
+    model: str,
+    allow_pii: bool,
+    confirm_pii: bool,
+    confirm_escalations: bool,
+    interactive: bool,
+    escalation_resolver: EscalationResolver | None = None,
+) -> tuple[list[ProposedFix], list[list[RepairAttempt]]]:
+    """Run repairers and gates issue-by-issue against a working dataframe.
+    Each issue gets up to three attempts. A candidate is first checked by the
+    safety filter and then by the SMT verifier; an accepted fix is written
+    into ``working_df`` immediately, so later issues see earlier repairs.
+    Args:
+        issues: Detected issues, processed in order.
+        path: Source file; used to locate the repairer cache directory.
+        working_df: Table that is mutated in place by accepted fixes.
+        schema: Optional schema passed to repairers, safety and verifier.
+        allow_llm: Forwarded to ``build_repairers``.
+        model: Forwarded to ``build_repairers``.
+        allow_pii: Initial safety-context flag.
+        confirm_pii: Initial safety-context flag.
+        confirm_escalations: Initial safety-context flag.
+        interactive: Must be true for ``escalation_resolver`` to be called.
+        escalation_resolver: Optional callback that may replace the safety
+            context and result when a candidate is escalated.
+    Returns:
+        The accepted proposals and, per issue, the list of its attempts.
+    """
+    with repair_stage_span("dataforge.repair.repairers.build", allow_llm=allow_llm):
+        repairers = build_repairers(
+            cache_dir=cache_dir_for(path),
+            allow_llm=allow_llm,
+            model=model,
+        )
+    safety_filter = SafetyFilter()
+    verifier = SMTVerifier()
+    safety_context = SafetyContext(
+        allow_pii=allow_pii,
+        confirm_pii=confirm_pii,
+        confirm_escalations=confirm_escalations,
+    )
+    accepted_fixes: list[ProposedFix] = []
+    attempt_groups: list[list[RepairAttempt]] = []
+    for issue in issues:
+        attempts: list[RepairAttempt] = []
+        repairer = repairers.get(issue.issue_type)
+        if repairer is None:
+            # Nothing can be proposed; record a single failed attempt.
+            attempts.append(
+                RepairAttempt(
+                    issue=issue,
+                    attempt_number=1,
+                    status="attempted_not_fixed",
+                    reason="No repairer is registered for this issue type.",
+                )
+            )
+            attempt_groups.append(attempts)
+            continue
+        accepted = False
+        retry_context = RetryContext(issue=issue)
+        # Attempts are numbered 1..3.
+        for attempt_number in range(1, 4):
+            candidate = repairer.propose(issue, working_df, schema, retry_context=retry_context)
+            if candidate is None:
+                attempts.append(
+                    RepairAttempt(
+                        issue=issue,
+                        attempt_number=attempt_number,
+                        status="attempted_not_fixed",
+                        reason="No repair proposal was available for this issue.",
+                    )
+                )
+                break
+            preferred = safety_filter.choose_preferred([candidate], schema, safety_context)
+            safety_result = safety_filter.evaluate(preferred, schema, safety_context)
+            if (
+                safety_result.verdict == SafetyVerdict.ESCALATE
+                and interactive
+                and escalation_resolver is not None
+            ):
+                # The resolver may hand back an updated context; it replaces
+                # the one used for the remaining attempts and issues.
+                safety_context, safety_result = escalation_resolver(
+                    preferred,
+                    schema,
+                    safety_context,
+                    safety_filter,
+                    safety_result,
+                )
+            if safety_result.verdict == SafetyVerdict.DENY:
+                # Denied candidates feed the retry context and trigger a retry.
+                attempts.append(
+                    RepairAttempt(
+                        issue=issue,
+                        attempt_number=attempt_number,
+                        fix=preferred,
+                        status="denied",
+                        reason=safety_result.reason,
+                    )
+                )
+                retry_context = _build_retry_context(issue, attempts)
+                continue
+            if safety_result.verdict == SafetyVerdict.ESCALATE:
+                # An unresolved escalation ends the attempts for this issue.
+                attempts.append(
+                    RepairAttempt(
+                        issue=issue,
+                        attempt_number=attempt_number,
+                        fix=preferred,
+                        status="escalated",
+                        reason=safety_result.reason,
+                    )
+                )
+                break
+            with repair_stage_span(
+                "dataforge.repair.verifier.verify",
+                issue_type=issue.issue_type,
+                row=issue.row,
+            ):
+                verifier_result = verifier.verify(working_df, [preferred], schema)
+            if verifier_result.verdict == VerificationVerdict.ACCEPT:
+                accepted_fixes.append(preferred)
+                # Apply the fix to the working table right away.
+                set_cell_value(
+                    working_df,
+                    preferred.fix.row,
+                    preferred.fix.column,
+                    preferred.fix.new_value,
+                )
+                attempts.append(
+                    RepairAttempt(
+                        issue=issue,
+                        attempt_number=attempt_number,
+                        fix=preferred,
+                        status="accepted",
+                        reason=verifier_result.reason,
+                    )
+                )
+                accepted = True
+                break
+            # Any verdict other than ACCEPT: REJECT is recorded as "rejected",
+            # everything else as "unknown"; both lead to another attempt.
+            attempts.append(
+                RepairAttempt(
+                    issue=issue,
+                    attempt_number=attempt_number,
+                    fix=preferred,
+                    status=(
+                        "rejected"
+                        if verifier_result.verdict == VerificationVerdict.REJECT
+                        else "unknown"
+                    ),
+                    reason=verifier_result.reason,
+                    unsat_core=verifier_result.unsat_core,
+                )
+            )
+            retry_context = _build_retry_context(issue, attempts)
+        # Attempts exhausted on a denied/rejected/unknown result: relabel the
+        # final attempt as "attempted_not_fixed" and keep its last reason.
+        if (
+            not accepted
+            and attempts
+            and attempts[-1].status not in {"attempted_not_fixed", "escalated"}
+        ):
+            last_reason = attempts[-1].reason
+            attempts[-1] = attempts[-1].model_copy(
+                update={
+                    "status": "attempted_not_fixed",
+                    "reason": (
+                        f"Issue was attempted but not fixed after {len(attempts)} attempt(s). "
+                        f"Last failure: {last_reason}"
+                    ),
+                }
+            )
+        attempt_groups.append(attempts)
+    return accepted_fixes, attempt_groups
+def _verified_fixes(
+    fixes: list[ProposedFix],
+    attempt_groups: list[list[RepairAttempt]],
+) -> list[VerifiedFix]:
+    """Convert accepted proposals into public ``VerifiedFix`` payloads.
+    The verifier reason is looked up by (row, column, new value) among the
+    accepted attempts; "Accepted by verifier." is used when none matches.
+    """
+    reason_by_cell: dict[tuple[int, str, str], str] = {
+        (
+            attempt.fix.fix.row,
+            attempt.fix.fix.column,
+            attempt.fix.fix.new_value,
+        ): attempt.reason
+        for group in attempt_groups
+        for attempt in group
+        if attempt.status == "accepted" and attempt.fix is not None
+    }
+    verified: list[VerifiedFix] = []
+    for proposal in fixes:
+        public_fields = CandidateFix.from_proposed(proposal).model_dump()
+        cell = proposal.fix
+        lookup_key = (cell.row, cell.column, cell.new_value)
+        verified.append(
+            VerifiedFix(
+                **public_fields,
+                verifier_reason=reason_by_cell.get(lookup_key, "Accepted by verifier."),
+            )
+        )
+    return verified
+def _failed_attempts(attempt_groups: list[list[RepairAttempt]]) -> list[RepairFailure]:
+    """Collect a failure record for every issue that did not end accepted.
+    Empty attempt groups are skipped.
+    """
+    failures: list[RepairFailure] = []
+    for group in attempt_groups:
+        if not group or group[-1].status == "accepted":
+            continue
+        failures.append(RepairFailure.from_attempts(group))
+    return failures
+def run_repair_pipeline(request: RepairPipelineRequest) -> RepairPipelineResult:
+    """Run the public repair pipeline from detection through optional apply.
+    Stages: read the source, run all detectors, propose and gate repairs on a
+    copy of the table, run the batch safety check, then apply the fixes,
+    journal a dry-run transaction, or only report.
+    Args:
+        request: Source path, mode and gating flags for this run.
+    Returns:
+        The receipt, detected issues, verified fixes and per-issue failures.
+        The transaction object is attached only for a journaled dry run; after
+        an apply only ``receipt.txn_id`` identifies the transaction.
+    Raises:
+        TransactionApplyError: Propagated from ``apply_transaction`` when
+            ``mode`` is ``"apply"``.
+    """
+    source_path = request.source_path.resolve()
+    source_bytes = source_path.read_bytes()
+    df = read_csv(source_path)
+    with repair_stage_span("dataforge.repair.detect", row_count=row_count(df)):
+        issues = run_all_detectors(df, request.repair_schema)
+    with repair_stage_span("dataforge.repair.propose", issue_count=len(issues)):
+        # Proposals mutate a copy so ``df`` keeps the pre-repair state.
+        accepted_fixes, attempt_groups = propose_repairs(
+            issues,
+            source_path,
+            copy_table(df),
+            request.repair_schema,
+            allow_llm=request.allow_llm,
+            model=request.model,
+            allow_pii=request.allow_pii,
+            confirm_pii=request.confirm_pii,
+            confirm_escalations=request.confirm_escalations,
+            interactive=request.interactive,
+        )
+    with repair_stage_span("dataforge.repair.safety.batch", fixes_count=len(accepted_fixes)):
+        batch_safety = SafetyFilter().evaluate_batch(accepted_fixes)
+    failures = _failed_attempts(attempt_groups)
+    transaction: RepairTransaction | None = None
+    txn_id: str | None = None
+    post_sha256: str | None = None
+    applied = False
+    reason = "No accepted fixes were produced."
+    if batch_safety.verdict != SafetyVerdict.ALLOW:
+        # The batch as a whole was not allowed: drop every accepted fix.
+        accepted_fixes = []
+        reason = batch_safety.reason
+    elif request.mode == "apply" and accepted_fixes:
+        txn_id = apply_transaction(source_path, accepted_fixes, source_bytes)
+        post_sha256 = sha256_file(source_path)
+        applied = True
+        reason = f"Applied {len(accepted_fixes)} fix(es)."
+    elif request.create_dry_run_transaction:
+        transaction, _log_path = create_repair_transaction(
+            source_path, accepted_fixes, source_bytes
+        )
+        txn_id = transaction.txn_id
+        reason = (
+            "Dry run completed without mutating the source file."
+            if accepted_fixes
+            else "No accepted fixes were produced."
+        )
+    elif accepted_fixes:
+        reason = "Dry run completed without mutating the source file."
+    # After an apply, ``transaction`` intentionally stays None: the journal is
+    # not replayed and the receipt's txn_id is enough for API callers.
+    receipt = RepairReceipt(
+        mode=request.mode,
+        applied=applied,
+        reversible=True,
+        source_path=str(source_path),
+        source_sha256=sha256_bytes(source_bytes),
+        post_sha256=post_sha256,
+        txn_id=txn_id,
+        allowed_columns=column_names(df),
+        valid_rows=list(range(row_count(df))),
+        issues_count=len(issues),
+        fixes_count=len(accepted_fixes),
+        reason=reason,
+    )
+    return RepairPipelineResult(
+        receipt=receipt,
+        issues=issues,
+        fixes=_verified_fixes(accepted_fixes, attempt_groups),
+        failures=failures,
+        transaction=transaction,
+    )

dataforge/env/__init__.py CHANGED Viewed

	@@ -1 +1,22 @@
1	- """~~Environment~~ ~~package~~ ~~scaffolding~~ ~~for~~ ~~DataForge~~.~~"""~~

+"""DataForge RL environment — OpenEnv-compatible data-quality environment.
+Public API:
+    DataForgeEnv     — Core environment with reset/step/state/close.
+    ResetResult      — Return type of reset().
+    StepResult       — Return type of step().
+    EnvState         — State snapshot from state().
+    DataForgeObservation — Agent-visible observation.
+    ToolResult       — Structured result from each action.
+"""
+from dataforge.env.environment import DataForgeEnv, EnvState, ResetResult, StepResult
+from dataforge.env.observation import DataForgeObservation, ToolResult
+__all__ = [
+    "DataForgeEnv",
+    "DataForgeObservation",
+    "EnvState",
+    "ResetResult",
+    "StepResult",
+    "ToolResult",
+]

dataforge/env/environment.py ADDED Viewed

	@@ -0,0 +1,884 @@

+"""OpenEnv-compatible DataForge RL environment.
+Core environment implementing reset/step/state/close for data-quality
+detection, diagnosis, and repair with typed tool-use actions.
+No LLM calls. No disk writes. Dataset state is in-memory per episode.
+"""
+from __future__ import annotations
+import logging
+import random
+import re
+import uuid
+from difflib import SequenceMatcher
+from pathlib import Path
+from typing import Any, cast
+import duckdb
+import pandas as pd
+import sqlglot
+import sqlglot.expressions as sqlglot_exp
+from pydantic import BaseModel, Field
+from dataforge.agent.scratchpad import Scratchpad
+from dataforge.agent.tool_actions import (
+    Action,
+    Diagnose,
+    Fix,
+    Hypothesis,
+    InspectRows,
+    PatternMatch,
+    RootCause,
+    SqlQuery,
+    StatTest,
+    parse_action,
+)
+from dataforge.detectors import run_all_detectors
+from dataforge.detectors.base import Issue
+from dataforge.env.observation import DataForgeObservation, ToolResult
+from dataforge.env.reward import (
+    P_FALSE_POS,
+    P_INVALID,
+    P_WRONG_FIX,
+    R_EXPLORE,
+    R_ROOT_CAUSE,
+    EpisodeMetrics,
+    RewardEngine,
+)
+logger = logging.getLogger("dataforge.env")
+__all__ = [
+    "DataForgeEnv",
+    "EnvState",
+    "ResetResult",
+    "StepResult",
+]
+_FIXTURES_DIR = Path(__file__).resolve().parents[1].parent / "fixtures"
+_DEFAULT_CSV = _FIXTURES_DIR / "hospital_10rows.csv"
+_DEFAULT_SCHEMA = _FIXTURES_DIR / "hospital_schema.yaml"
+_MAX_STEPS = 30
+_MAX_RESULT_ROWS = 20
+_TOOL_HISTORY_LIMIT = 5
+_NOISE_EPSILON = 0.15
+_BLOCKED_SQL_FRAGMENTS = (
+    "attach",
+    "call ",
+    "copy ",
+    "detach",
+    "duckdb_extensions",
+    "filename",
+    "from_csv_auto",
+    "glob(",
+    "http://",
+    "https://",
+    "httpfs",
+    "install",
+    "load ",
+    "mysql_scan",
+    "parquet_scan",
+    "postgres_scan",
+    "pragma",
+    "read_csv",
+    "read_json",
+    "read_parquet",
+    "s3://",
+    "sqlite_scan",
+)
+# ═══════════════════════════════════════════════════════════════════════════
+# Result models
+# ═══════════════════════════════════════════════════════════════════════════
+class ResetResult(BaseModel):
+    """Result of env.reset()."""
+    # Initial observation for the new episode.
+    observation: DataForgeObservation
+    # Free-form extra data; defaults to a fresh empty dict per instance.
+    info: dict[str, Any] = Field(default_factory=dict)
+class StepResult(BaseModel):
+    """Result of env.step()."""
+    observation: DataForgeObservation
+    # Defaults describe a step with zero reward that does not end the episode.
+    reward: float = 0.0
+    done: bool = False
+    # Free-form extra data; defaults to a fresh empty dict per instance.
+    info: dict[str, Any] = Field(default_factory=dict)
+class EnvState(BaseModel):
+    """Internal environment state snapshot.
+    Every field has a default, so ``EnvState()`` describes an environment with
+    no episode id, zero counters and ``is_done`` false.
+    """
+    # NOTE(review): presumably mirrors the DataForgeEnv episode counters of the
+    # same names - confirm against DataForgeEnv.state().
+    episode_id: str = ""
+    step_count: int = 0
+    task_id: str = ""
+    issues_detected: int = 0
+    issues_fixed: int = 0
+    false_positives: int = 0
+    total_issues: int = 0
+    is_done: bool = False
+# ═══════════════════════════════════════════════════════════════════════════
+# Environment
+# ═══════════════════════════════════════════════════════════════════════════
+class DataForgeEnv:
+    """OpenEnv-compatible RL environment for data quality repair.
+    Core API: ``reset()``, ``step()``, ``state()``, ``close()`` (no-op).
+    Example::
+        >>> env = DataForgeEnv()
+        >>> result = env.reset(seed=42)
+        >>> result.observation.done
+        False
+    """
+    def __init__(self, max_steps: int = _MAX_STEPS) -> None:
+        self._max_steps = max_steps
+        self._episode_id = ""
+        self._step_count = 0
+        self._df: pd.DataFrame = pd.DataFrame()
+        self._ground_truth: list[Issue] = []
+        self._found_issues: list[dict[str, Any]] = []
+        self._fixed_issues: list[dict[str, Any]] = []
+        self._false_positives = 0
+        self._cumulative_reward = 0.0
+        self._is_done = False
+        self._inspected_rows: set[int] = set()
+        self._noisy = False
+        self._noise_rng: random.Random | None = None
+        self._scratchpad = Scratchpad()
+        self._tool_history: list[ToolResult] = []
+        self._reward_engine = RewardEngine()
+        self._schema_info: dict[str, str] = {}
+        self._causal_dag_cache: Any = None
+        self._root_cause_labels: set[int] = set()
+    # ── Core API ──────────────────────────────────────────────────────────
+    def reset(self, seed: int | None = None, *, noisy: bool = False) -> ResetResult:
+        """Reset the environment for a new episode.
+        Args:
+            seed: Optional RNG seed for deterministic episodes.
+            noisy: If True, enable observation noise (epsilon=0.15).
+        Returns:
+            ResetResult with initial observation.
+        """
+        self._episode_id = str(uuid.uuid4())
+        self._step_count = 0
+        self._found_issues = []
+        self._fixed_issues = []
+        self._false_positives = 0
+        self._cumulative_reward = 0.0
+        self._is_done = False
+        self._inspected_rows = set()
+        self._scratchpad.reset()
+        self._tool_history = []
+        self._causal_dag_cache = None
+        self._root_cause_labels = set()
+        self._noisy = noisy
+        self._noise_rng = random.Random(seed if seed is not None else 0) if noisy else None
+        # Load fixture dataset
+        self._df = pd.read_csv(_DEFAULT_CSV, dtype=str)
+        self._schema_info = dict.fromkeys(self._df.columns, "str")
+        if _DEFAULT_SCHEMA.exists():
+            import yaml
+            with open(_DEFAULT_SCHEMA, encoding="utf-8") as f:
+                schema_data = yaml.safe_load(f)
+            if schema_data and "columns" in schema_data:
+                self._schema_info = schema_data["columns"]
+        # Run detectors for hidden ground truth
+        self._ground_truth = run_all_detectors(self._df)
+        logger.info(
+            "Episode %s: %d rows, %d ground-truth issues",
+            self._episode_id[:8],
+            len(self._df),
+            len(self._ground_truth),
+        )
+        # Initial observation with first 5 rows
+        initial_rows = cast(list[dict[str, Any]], self._df.head(5).to_dict(orient="records"))
+        obs = DataForgeObservation(
+            visible_rows=initial_rows,
+            step_budget_remaining=self._max_steps,
+            scratchpad_summary=self._scratchpad.summary(),
+            metadata={
+                "episode_id": self._episode_id,
+                "total_rows": len(self._df),
+                "total_columns": len(self._df.columns),
+                "schema": self._schema_info,
+            },
+        )
+        return ResetResult(observation=obs, info={"episode_id": self._episode_id})
+    def step(self, action: Action | dict[str, Any]) -> StepResult:
+        """Execute one agent action and return the result.
+        Args:
+            action: A typed Action model or raw dict to be parsed.
+        Returns:
+            StepResult with observation, reward, and done flag.
+        """
+        if self._is_done:
+            return self._terminal_result(0.0)
+        self._step_count += 1
+        # Parse if raw dict
+        if isinstance(action, dict):
+            try:
+                action = parse_action(action)
+            except Exception as exc:
+                return self._error_step(str(exc))
+        # Dispatch
+        try:
+            tool_result, reward = self._dispatch(action)
+        except Exception as exc:
+            logger.exception("Action dispatch error at step %d", self._step_count)
+            return self._error_step(str(exc))
+        # Late-step penalty
+        reward += self._reward_engine.compute_late_penalty(self._step_count, self._max_steps)
+        # Accumulate
+        self._cumulative_reward += reward
+        # Record in history
+        self._tool_history.append(tool_result)
+        if len(self._tool_history) > _TOOL_HISTORY_LIMIT:
+            self._tool_history = self._tool_history[-_TOOL_HISTORY_LIMIT:]
+        # Check termination
+        done = self._step_count >= self._max_steps
+        if done:
+            self._is_done = True
+            terminal = self._compute_terminal()
+            self._cumulative_reward = max(self._cumulative_reward, terminal)
+        obs = DataForgeObservation(
+            visible_rows=tool_result.data
+            if tool_result.action_type == "INSPECT_ROWS" and tool_result.success
+            else None,
+            scratchpad_summary=self._scratchpad.summary(),
+            step_budget_remaining=max(0, self._max_steps - self._step_count),
+            tool_usage_history=list(self._tool_history),
+            latest_result=tool_result,
+            done=done,
+            reward=reward,
+            cumulative_reward=self._cumulative_reward,
+        )
+        return StepResult(observation=obs, reward=reward, done=done)
+    def state(self) -> EnvState:
+        """Return current internal state snapshot."""
+        return EnvState(
+            episode_id=self._episode_id,
+            step_count=self._step_count,
+            issues_detected=len(self._found_issues),
+            issues_fixed=len(self._fixed_issues),
+            false_positives=self._false_positives,
+            total_issues=len(self._ground_truth),
+            is_done=self._is_done,
+        )
+    def close(self) -> None:
+        """No-op. Retained for OpenEnv container compatibility."""
+    # ── Dispatch ──────────────────────────────────────────────────────────
+    def _dispatch(self, action: Action) -> tuple[ToolResult, float]:
+        """Route action to handler. Returns (tool_result, step_reward)."""
+        if isinstance(action, InspectRows):
+            return self._handle_inspect(action)
+        if isinstance(action, SqlQuery):
+            return self._handle_sql(action)
+        if isinstance(action, StatTest):
+            return self._handle_stat(action)
+        if isinstance(action, PatternMatch):
+            return self._handle_pattern(action)
+        if isinstance(action, Hypothesis):
+            return self._handle_hypothesis(action)
+        if isinstance(action, RootCause):
+            return self._handle_root_cause(action)
+        if isinstance(action, Diagnose):
+            return self._handle_diagnose(action)
+        if isinstance(action, Fix):
+            return self._handle_fix(action)
+        return ToolResult(
+            action_type="UNKNOWN",
+            success=False,
+            error={"verdict": "error", "reason": "Unknown action type"},
+        ), P_INVALID
+    # ── Action handlers ───────────────────────────────────────────────────
+    def _handle_inspect(self, action: InspectRows) -> tuple[ToolResult, float]:
+        """Handle INSPECT_ROWS: return dataset rows."""
+        valid_indices = [i for i in action.row_indices if 0 <= i < len(self._df)]
+        if not valid_indices:
+            return ToolResult(
+                action_type="INSPECT_ROWS",
+                success=False,
+                error={"verdict": "error", "reason": "No valid row indices"},
+            ), P_INVALID
+        # Apply 20-row cap
+        valid_indices = valid_indices[:20]
+        rows = self._df.iloc[valid_indices]
+        if action.column_names:
+            valid_cols = [c for c in action.column_names if c in self._df.columns]
+            if valid_cols:
+                rows = rows[valid_cols]
+        row_dicts = cast(list[dict[str, Any]], rows.to_dict(orient="records"))
+        for i, idx in enumerate(valid_indices[: len(row_dicts)]):
+            row_dicts[i]["_row_index"] = idx
+        # Noise injection
+        if self._noisy and self._noise_rng:
+            row_dicts = self._inject_noise(row_dicts)
+        # Exploration bonus
+        new_indices = set(valid_indices) - self._inspected_rows
+        self._inspected_rows.update(valid_indices)
+        gt_rows = {issue.row for issue in self._ground_truth}
+        found_rows = {f["row"] for f in self._found_issues}
+        bonus = self._reward_engine.compute_exploration_bonus(
+            new_indices,
+            self._inspected_rows,
+            len(self._df),
+            gt_rows,
+            found_rows,
+        )
+        return ToolResult(action_type="INSPECT_ROWS", success=True, data=row_dicts), bonus
+    def _handle_sql(self, action: SqlQuery) -> tuple[ToolResult, float]:
+        """Handle SQL_QUERY: execute read-only SQL via DuckDB."""
+        # Validate read-only
+        try:
+            parsed = [stmt for stmt in sqlglot.parse(action.query) if stmt is not None]
+        except sqlglot.errors.ParseError as exc:
+            return ToolResult(
+                action_type="SQL_QUERY",
+                success=False,
+                error={
+                    "verdict": "error",
+                    "reason": str(exc),
+                    "suggested_constraint": "Use valid SQL syntax",
+                },
+            ), P_INVALID
+        if len(parsed) != 1:
+            return ToolResult(
+                action_type="SQL_QUERY",
+                success=False,
+                error={
+                    "verdict": "rejected",
+                    "reason": "Exactly one SELECT statement is allowed.",
+                    "suggested_constraint": "Use a single read-only SELECT statement.",
+                },
+            ), P_INVALID
+        normalized_query = f" {action.query.lower()} "
+        blocked = next(
+            (fragment for fragment in _BLOCKED_SQL_FRAGMENTS if fragment in normalized_query),
+            None,
+        )
+        if blocked is not None:
+            return ToolResult(
+                action_type="SQL_QUERY",
+                success=False,
+                error={
+                    "verdict": "rejected",
+                    "reason": "SQL_QUERY may only read from the registered data relation.",
+                    "suggested_constraint": "Query the in-memory data table without file, network, extension, or table functions.",
+                },
+            ), P_INVALID
+        for stmt in parsed:
+            if stmt.key not in ("select",):
+                return ToolResult(
+                    action_type="SQL_QUERY",
+                    success=False,
+                    error={
+                        "verdict": "rejected",
+                        "reason": f"Only SELECT queries allowed, got {stmt.key.upper()}",
+                        "suggested_constraint": "Use SELECT statements only",
+                    },
+                ), P_INVALID
+            for table in stmt.find_all(sqlglot_exp.Table):
+                if table.name.lower() != "data":
+                    return ToolResult(
+                        action_type="SQL_QUERY",
+                        success=False,
+                        error={
+                            "verdict": "rejected",
+                            "reason": (
+                                "SQL_QUERY may only reference the registered data relation; "
+                                f"got '{table.name}'."
+                            ),
+                            "suggested_constraint": "Use FROM data for tabular queries.",
+                        },
+                    ), P_INVALID
+        try:
+            conn = duckdb.connect(":memory:")
+            conn.register("data", self._df)
+            result_df = conn.execute(action.query).fetchdf()
+            conn.close()
+            rows = result_df.head(_MAX_RESULT_ROWS).to_dict(orient="records")
+            return ToolResult(action_type="SQL_QUERY", success=True, data=rows), 0.0
+        except duckdb.Error as exc:
+            return ToolResult(
+                action_type="SQL_QUERY",
+                success=False,
+                error={"verdict": "error", "reason": str(exc)},
+            ), P_INVALID
+    def _handle_stat(self, action: StatTest) -> tuple[ToolResult, float]:
+        """Handle STAT_TEST: run zscore/iqr/ks on a column."""
+        if action.column not in self._df.columns:
+            return ToolResult(
+                action_type="STAT_TEST",
+                success=False,
+                error={"verdict": "error", "reason": f"Column '{action.column}' not found"},
+            ), P_INVALID
+        try:
+            col = pd.to_numeric(self._df[action.column], errors="coerce").dropna()
+            if len(col) == 0:
+                return ToolResult(
+                    action_type="STAT_TEST",
+                    success=False,
+                    error={
+                        "verdict": "error",
+                        "reason": f"No numeric values in column '{action.column}'",
+                    },
+                ), P_INVALID
+        except Exception as exc:
+            return ToolResult(
+                action_type="STAT_TEST",
+                success=False,
+                error={"verdict": "error", "reason": str(exc)},
+            ), P_INVALID
+        from scipy import stats as scipy_stats  # type: ignore[import-untyped]
+        if action.test_type == "zscore":
+            zscores = scipy_stats.zscore(col)
+            threshold = action.threshold or 3.0
+            outliers = col.index[abs(zscores) > threshold].tolist()
+            data = {
+                "test": "zscore",
+                "threshold": threshold,
+                "outlier_indices": outliers,
+                "n_outliers": len(outliers),
+                "mean": float(col.mean()),
+                "std": float(col.std()),
+            }
+        elif action.test_type == "iqr":
+            q1, q3 = float(col.quantile(0.25)), float(col.quantile(0.75))
+            iqr_val = q3 - q1
+            factor = action.threshold or 1.5
+            lower, upper = q1 - factor * iqr_val, q3 + factor * iqr_val
+            outliers = col.index[(col < lower) | (col > upper)].tolist()
+            data = {
+                "test": "iqr",
+                "q1": q1,
+                "q3": q3,
+                "iqr": iqr_val,
+                "lower": lower,
+                "upper": upper,
+                "outlier_indices": outliers,
+            }
+        elif action.test_type == "ks":
+            stat_val, p_val = scipy_stats.kstest(
+                col, "norm", args=(float(col.mean()), float(col.std()))
+            )
+            data = {
+                "test": "ks",
+                "statistic": float(stat_val),
+                "p_value": float(p_val),
+                "normal": p_val > 0.05,
+            }
+        else:
+            return ToolResult(
+                action_type="STAT_TEST",
+                success=False,
+                error={"verdict": "error", "reason": f"Unknown test type: {action.test_type}"},
+            ), P_INVALID
+        return ToolResult(action_type="STAT_TEST", success=True, data=data), 0.0
+    def _handle_pattern(self, action: PatternMatch) -> tuple[ToolResult, float]:
+        """Handle PATTERN_MATCH: evaluate regex against column values."""
+        if action.column not in self._df.columns:
+            return ToolResult(
+                action_type="PATTERN_MATCH",
+                success=False,
+                error={"verdict": "error", "reason": f"Column '{action.column}' not found"},
+            ), P_INVALID
+        try:
+            compiled = re.compile(action.pattern)
+        except re.error as exc:
+            return ToolResult(
+                action_type="PATTERN_MATCH",
+                success=False,
+                error={"verdict": "error", "reason": f"Invalid regex: {exc}"},
+            ), P_INVALID
+        matches: list[dict[str, Any]] = []
+        for idx, val in enumerate(self._df[action.column].astype(str)):
+            is_match = bool(compiled.search(val))
+            if is_match == action.expect_match:
+                matches.append({"row": idx, "column": action.column, "value": val})
+        return ToolResult(
+            action_type="PATTERN_MATCH",
+            success=True,
+            data={"matches": matches[:_MAX_RESULT_ROWS], "total_matches": len(matches)},
+        ), 0.0
+    def _handle_hypothesis(self, action: Hypothesis) -> tuple[ToolResult, float]:
+        """Handle HYPOTHESIS: record claim and award root-cause credit."""
+        self._scratchpad.add_hypothesis(
+            action.claim,
+            action.affected_rows,
+            action.affected_columns,
+            action.root_cause_type,
+        )
+        # Check for root-cause match against ground truth
+        credit = 0.0
+        for issue in self._ground_truth:
+            if (
+                issue.row in action.affected_rows
+                and issue.column in action.affected_columns
+                and issue.issue_type == action.root_cause_type
+            ):
+                credit += R_EXPLORE
+        data = {"recorded": True, "root_cause_credit": credit}
+        return ToolResult(action_type="HYPOTHESIS", success=True, data=data), credit
+    def _handle_root_cause(self, action: RootCause) -> tuple[ToolResult, float]:
+        """Handle ROOT_CAUSE: analyze detected issues for minimal roots."""
+        if not self._found_issues:
+            return ToolResult(
+                action_type="ROOT_CAUSE",
+                success=False,
+                error={"verdict": "error", "reason": "No detected issues are available"},
+            ), P_INVALID
+        invalid = [idx for idx in action.error_indices if idx >= len(self._found_issues)]
+        if invalid:
+            return ToolResult(
+                action_type="ROOT_CAUSE",
+                success=False,
+                error={
+                    "verdict": "error",
+                    "reason": f"Detected issue indices out of range: {invalid}",
+                },
+            ), P_INVALID
+        from dataforge.causal.pc import discover_causal_dag
+        from dataforge.causal.root_cause import CausalRootCauseAnalyzer, evidence_from_issue
+        if self._causal_dag_cache is None:
+            self._causal_dag_cache = discover_causal_dag(self._df).dag
+        selected = [
+            evidence_from_issue(index, self._found_issues[index]) for index in action.error_indices
+        ]
+        result = CausalRootCauseAnalyzer(self._causal_dag_cache).analyze(selected)
+        data = result.model_dump(mode="json")
+        reward = self._root_cause_reward(set(result.root_indices))
+        return ToolResult(action_type="ROOT_CAUSE", success=True, data=data), reward
+    def _handle_diagnose(self, action: Diagnose) -> tuple[ToolResult, float]:
+        """Handle DIAGNOSE: score against ground truth."""
+        if action.row < 0 or action.row >= len(self._df):
+            return ToolResult(
+                action_type="DIAGNOSE",
+                success=False,
+                error={"verdict": "error", "reason": f"Row {action.row} out of bounds"},
+            ), P_INVALID
+        if action.column not in self._df.columns:
+            return ToolResult(
+                action_type="DIAGNOSE",
+                success=False,
+                error={"verdict": "error", "reason": f"Column '{action.column}' not found"},
+            ), P_INVALID
+        # Already reported?
+        for found in self._found_issues:
+            if found["row"] == action.row and found["column"] == action.column:
+                return ToolResult(
+                    action_type="DIAGNOSE", success=True, data={"result": "already_found"}
+                ), 0.0
+        # Match ground truth
+        for issue in self._ground_truth:
+            if issue.row == action.row and issue.column == action.column:
+                type_match = action.issue_type == issue.issue_type
+                reward = self._reward_engine.diagnose_reward(type_match)
+                self._found_issues.append(
+                    {"row": action.row, "column": action.column, "type": action.issue_type}
+                )
+                self._scratchpad.confirm_issue(action.row, action.column, action.issue_type)
+                return ToolResult(
+                    action_type="DIAGNOSE",
+                    success=True,
+                    data={"result": "correct", "type_match": type_match},
+                ), reward
+        # False positive
+        self._false_positives += 1
+        return ToolResult(
+            action_type="DIAGNOSE", success=True, data={"result": "false_positive"}
+        ), P_FALSE_POS
+    def _root_cause_reward(self, root_indices: set[int]) -> float:
+        """Return root-cause bonus only when task labels are available."""
+        if not self._root_cause_labels:
+            return 0.0
+        return R_ROOT_CAUSE if root_indices == self._root_cause_labels else 0.0
+    def _handle_fix(self, action: Fix) -> tuple[ToolResult, float]:
+        """Handle FIX: validate through safety/SMT, then score.
+        The proposed value is compared with the expected value of the first
+        ground-truth issue at the target cell, in this order: exact match
+        (case-insensitive), numeric match within 1% relative error, then
+        string similarity of at least 85%. The dataset itself is not
+        modified here; accepted fixes are only recorded.
+        Args:
+            action: Target row/column, the proposed ``new_value`` and a
+                ``justification``.
+        Returns:
+            ``(tool_result, reward)``. ``P_INVALID`` for an out-of-bounds
+            target or a fix rejected by the safety pipeline, ``P_WRONG_FIX``
+            for a wrong value or a cell without an issue, and ``0.0`` for a
+            cell that is already fixed or is detection-only.
+        """
+        if action.row < 0 or action.row >= len(self._df):
+            return ToolResult(
+                action_type="FIX",
+                success=False,
+                error={"verdict": "error", "reason": f"Row {action.row} out of bounds"},
+            ), P_INVALID
+        if action.column not in self._df.columns:
+            return ToolResult(
+                action_type="FIX",
+                success=False,
+                error={"verdict": "error", "reason": f"Column '{action.column}' not found"},
+            ), P_INVALID
+        # Already fixed?
+        for fixed in self._fixed_issues:
+            if fixed["row"] == action.row and fixed["column"] == action.column:
+                return ToolResult(
+                    action_type="FIX", success=True, data={"result": "already_fixed"}
+                ), 0.0
+        # Safety filter + SMT verifier. This fails closed: any exception from
+        # the pipeline rejects the fix instead of letting it through.
+        try:
+            safety_ok, safety_msg = self._check_safety(action)
+        except Exception as exc:
+            logger.warning("Safety pipeline failed closed: %s", exc)
+            safety_ok = False
+            safety_msg = f"Safety pipeline failed closed: {exc}"
+        if not safety_ok:
+            return ToolResult(
+                action_type="FIX",
+                success=False,
+                error={"verdict": "rejected", "reason": safety_msg},
+            ), P_INVALID
+        # Match ground truth
+        for issue in self._ground_truth:
+            if issue.row == action.row and issue.column == action.column:
+                # Issues without an expected value can be detected but not repaired
+                if issue.expected is None:
+                    return ToolResult(
+                        action_type="FIX", success=True, data={"result": "detection_only"}
+                    ), 0.0
+                # Exact match (case-insensitive)
+                if action.new_value.strip().lower() == str(issue.expected).lower():
+                    reward = self._reward_engine.fix_reward(
+                        exact=True, has_justification=bool(action.justification)
+                    )
+                    self._fixed_issues.append(
+                        {"row": action.row, "column": action.column, "value": action.new_value}
+                    )
+                    self._auto_diagnose(action, issue)
+                    return ToolResult(
+                        action_type="FIX", success=True, data={"result": "correct"}
+                    ), reward
+                # Partial: numeric within 1%
+                try:
+                    prov = float(action.new_value.strip())
+                    exp = float(str(issue.expected))
+                    # With an expected value of 0 the absolute value is used instead
+                    rel_err = abs(prov - exp) / abs(exp) if exp != 0 else abs(prov)
+                    if rel_err < 0.01:
+                        reward = self._reward_engine.fix_reward(
+                            exact=False, has_justification=bool(action.justification)
+                        )
+                        self._fixed_issues.append(
+                            {"row": action.row, "column": action.column, "value": action.new_value}
+                        )
+                        self._auto_diagnose(action, issue)
+                        return ToolResult(
+                            action_type="FIX", success=True, data={"result": "partial_numeric"}
+                        ), reward
+                except (ValueError, TypeError):
+                    # Not numeric: fall through to the string-similarity check
+                    pass
+                # Partial: string similarity >= 85%
+                sim = SequenceMatcher(
+                    None, action.new_value.lower(), str(issue.expected).lower()
+                ).ratio()
+                if sim >= 0.85:
+                    reward = self._reward_engine.fix_reward(
+                        exact=False, has_justification=bool(action.justification)
+                    )
+                    self._fixed_issues.append(
+                        {"row": action.row, "column": action.column, "value": action.new_value}
+                    )
+                    self._auto_diagnose(action, issue)
+                    return ToolResult(
+                        action_type="FIX", success=True, data={"result": "partial_string"}
+                    ), reward
+                # Only the first ground-truth issue at this cell is considered
+                return ToolResult(
+                    action_type="FIX", success=True, data={"result": "wrong_value"}
+                ), P_WRONG_FIX
+        return ToolResult(
+            action_type="FIX", success=True, data={"result": "no_issue_at_location"}
+        ), P_WRONG_FIX
+    # ── Helpers ────────────────────────────────────────────────────────────
+    def _check_safety(self, action: Fix) -> tuple[bool, str]:
+        """Run SafetyFilter + SMTVerifier. Returns (ok, message)."""
+        try:
+            from dataforge.repairers.base import ProposedFix
+            from dataforge.safety.filter import SafetyContext, SafetyFilter, SafetyVerdict
+            from dataforge.transactions.txn import CellFix
+            from dataforge.verifier.smt import SMTVerifier, VerificationVerdict
+            old_val = str(self._df.at[action.row, action.column])
+            cell_fix = CellFix(
+                row=action.row,
+                column=action.column,
+                old_value=old_val,
+                new_value=action.new_value,
+                detector_id="agent",
+            )
+            proposed = ProposedFix(
+                fix=cell_fix,
+                reason=action.justification,
+                confidence=0.8,
+                provenance="deterministic",
+            )
+            sf = SafetyFilter()
+            ctx = SafetyContext()
+            sr = sf.evaluate(proposed, None, ctx)
+            if sr.verdict == SafetyVerdict.DENY:
+                return False, f"Safety filter denied: {sr.reason}"
+            verifier = SMTVerifier()
+            vr = verifier.verify(self._df, [proposed])
+            if vr.verdict == VerificationVerdict.REJECT:
+                return False, f"SMT verifier rejected: {vr.reason}"
+            if vr.verdict == VerificationVerdict.UNKNOWN:
+                return False, f"SMT verifier returned unknown: {vr.reason}"
+            return True, "Passed safety and verification"
+        except ImportError as exc:
+            return False, f"Safety/verifier dependency unavailable: {exc}"
+    def _auto_diagnose(self, action: Fix, issue: Issue) -> None:
+        """Auto-credit diagnosis when agent fixes without diagnosing first."""
+        already = any(
+            f["row"] == action.row and f["column"] == action.column for f in self._found_issues
+        )
+        if not already:
+            self._found_issues.append(
+                {"row": action.row, "column": action.column, "type": issue.issue_type}
+            )
+    def _inject_noise(self, rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
+        """Apply deterministic observation noise (epsilon=0.15)."""
+        if not self._noise_rng:
+            return rows
+        noisy = []
+        for row in rows:
+            row_copy = dict(row)
+            if self._noise_rng.random() < _NOISE_EPSILON:
+                cols = [k for k in row_copy if k != "_row_index"]
+                if cols:
+                    col = self._noise_rng.choice(cols)
+                    val = row_copy[col]
+                    if isinstance(val, str) and len(val) > 3:
+                        row_copy[col] = (
+                            val[: -(self._noise_rng.randint(1, 3))]
+                            if self._noise_rng.random() < 0.5
+                            else val.swapcase()
+                        )
+            noisy.append(row_copy)
+        return noisy
+    def _compute_terminal(self) -> float:
+        """Compute terminal score."""
+        fixable = [i for i in self._ground_truth if i.expected is not None]
+        metrics = EpisodeMetrics(
+            found_issues=len(self._found_issues),
+            total_issues=len(self._ground_truth),
+            fixed_issues=len(self._fixed_issues),
+            fixable_issues=len(fixable),
+            false_positives=self._false_positives,
+        )
+        return self._reward_engine.compute_terminal_score(metrics)
+    def _error_step(self, message: str) -> StepResult:
+        """Build error StepResult."""
+        tr = ToolResult(
+            action_type="ERROR", success=False, error={"verdict": "error", "reason": message}
+        )
+        self._tool_history.append(tr)
+        self._cumulative_reward += P_INVALID
+        done = self._step_count >= self._max_steps
+        if done:
+            self._is_done = True
+        return StepResult(
+            observation=DataForgeObservation(
+                step_budget_remaining=max(0, self._max_steps - self._step_count),
+                tool_usage_history=list(self._tool_history[-_TOOL_HISTORY_LIMIT:]),
+                latest_result=tr,
+                done=done,
+                reward=P_INVALID,
+                cumulative_reward=self._cumulative_reward,
+                scratchpad_summary=self._scratchpad.summary(),
+            ),
+            reward=P_INVALID,
+            done=done,
+        )
+    def _terminal_result(self, reward: float) -> StepResult:
+        """Build terminal StepResult for already-done episodes."""
+        return StepResult(
+            observation=DataForgeObservation(
+                step_budget_remaining=0,
+                done=True,
+                reward=reward,
+                cumulative_reward=self._cumulative_reward,
+                scratchpad_summary=self._scratchpad.summary(),
+                tool_usage_history=list(self._tool_history[-_TOOL_HISTORY_LIMIT:]),
+            ),
+            reward=reward,
+            done=True,
+        )

dataforge/env/observation.py ADDED Viewed

	@@ -0,0 +1,61 @@

+"""Observation builder for the DataForge RL environment.
+Constructs agent-visible observations containing partial data views,
+scratchpad summaries, tool results, and step budget information.
+"""
+from __future__ import annotations
+from typing import Any
+from pydantic import BaseModel, Field
+__all__ = ["DataForgeObservation", "ToolResult"]
+class ToolResult(BaseModel):
+    """Result of a single tool-use action.
+    Args:
+        action_type: The action type that produced this result.
+        success: Whether the action succeeded.
+        data: Action-specific result data (rows, stats, matches, etc.).
+        error: Structured error information if the action failed.
+    """
+    # Required; the only field without a default.
+    action_type: str
+    success: bool = True
+    # Untyped payload; its shape depends on the action that produced it.
+    data: Any = None
+    error: dict[str, Any] | None = None
+    # Frozen model: fields cannot be reassigned after construction.
+    model_config = {"frozen": True}
+class DataForgeObservation(BaseModel):
+    """Agent-visible observation returned after each environment step.
+    Every field has a default, so callers set only what applies to the
+    current step.
+    Args:
+        visible_rows: Dataset rows returned by INSPECT_ROWS or reset.
+        detector_hints: Optional hints from detectors (partial ground truth).
+        scratchpad_summary: Compact summary of the agent's scratchpad.
+        step_budget_remaining: Steps left before auto-finalize.
+        tool_usage_history: Recent tool results for context (documented as
+            the last 5; this model itself does not limit the length).
+        latest_result: Result of the most recent action.
+        done: Whether the episode has ended.
+        reward: Step reward.
+        cumulative_reward: Running total reward for the episode.
+        metadata: Additional key-value metadata.
+    """
+    visible_rows: list[dict[str, Any]] | None = None
+    detector_hints: list[str] | None = None
+    scratchpad_summary: str = ""
+    step_budget_remaining: int = 0
+    # default_factory gives each instance its own list
+    tool_usage_history: list[ToolResult] = Field(default_factory=list)
+    latest_result: ToolResult | None = None
+    done: bool = False
+    reward: float = 0.0
+    cumulative_reward: float = 0.0
+    # default_factory gives each instance its own dict
+    metadata: dict[str, Any] = Field(default_factory=dict)
+    # Frozen model: fields cannot be reassigned after construction.
+    model_config = {"frozen": True}

dataforge/env/openenv_core.py ADDED Viewed

	@@ -0,0 +1,146 @@

+"""OpenEnv-core adapter for the DataForge RL environment."""
+from __future__ import annotations
+from typing import Any
+from pydantic import Field
+from dataforge.env.environment import DataForgeEnv
+try:
+    from openenv.core.env_server import (
+        Action as OpenEnvAction,
+    )
+    from openenv.core.env_server import (
+        Environment as OpenEnvEnvironment,
+    )
+    from openenv.core.env_server import (
+        Observation as OpenEnvObservation,
+    )
+    from openenv.core.env_server import (
+        create_app,
+    )
+except ImportError as exc:  # pragma: no cover - exercised only without openenv extra
+    raise RuntimeError(
+        "The OpenEnv adapter requires the openenv extra: "
+        "pip install 'dataforge15[openenv]'."
+    ) from exc
+class DataForgeOpenEnvAction(OpenEnvAction):
+    """OpenEnv action wrapper for DataForge's typed action payloads."""
+    action_type: str = Field(min_length=1)
+    row_indices: list[int] | None = None
+    column_names: list[str] | None = None
+    query: str | None = None
+    sql: str | None = None
+    test_type: str | None = None
+    test: str | None = None
+    column: str | None = None
+    threshold: float | None = None
+    pattern: str | None = None
+    regex: str | None = None
+    expect_match: bool | None = None
+    claim: str | None = None
+    affected_rows: list[int] | None = None
+    affected_columns: list[str] | None = None
+    root_cause_type: str | None = None
+    error_indices: list[int] | None = None
+    row: int | None = None
+    issue_type: str | None = None
+    new_value: str | None = None
+    proposed_value: str | None = None
+    justification: str | None = None
+    fix_type: str | None = None
+    def as_dataforge_payload(self) -> dict[str, Any]:
+        """Return the action payload expected by ``DataForgeEnv.step``."""
+        payload = self.model_dump(exclude_none=True)
+        payload.pop("metadata", None)
+        return payload
+class DataForgeOpenEnvObservation(OpenEnvObservation):
+    """OpenEnv observation model mirroring DataForge's native observation."""
+    # ``done``, ``reward`` and ``metadata`` are passed by
+    # ``_to_openenv_observation`` but not declared here -- presumably they are
+    # inherited from the OpenEnv base observation; verify against openenv.
+    visible_rows: list[dict[str, Any]] | None = None
+    detector_hints: list[str] | None = None
+    scratchpad_summary: str = ""
+    step_budget_remaining: int = 0
+    # Tool results are carried as plain dicts here rather than ToolResult models.
+    tool_usage_history: list[dict[str, Any]] = Field(default_factory=list)
+    latest_result: dict[str, Any] | None = None
+    cumulative_reward: float = 0.0
+def _to_openenv_observation(payload: dict[str, Any]) -> DataForgeOpenEnvObservation:
+    """Convert a native DataForge observation dictionary into OpenEnv shape."""
+    return DataForgeOpenEnvObservation(
+        visible_rows=payload.get("visible_rows"),
+        detector_hints=payload.get("detector_hints"),
+        scratchpad_summary=str(payload.get("scratchpad_summary", "")),
+        step_budget_remaining=int(payload.get("step_budget_remaining", 0)),
+        tool_usage_history=list(payload.get("tool_usage_history") or []),
+        latest_result=payload.get("latest_result"),
+        done=bool(payload.get("done", False)),
+        reward=payload.get("reward"),
+        cumulative_reward=float(payload.get("cumulative_reward", 0.0)),
+        metadata=dict(payload.get("metadata") or {}),
+    )
+class DataForgeOpenEnv(OpenEnvEnvironment):
+    """OpenEnv-native environment wrapper."""
+    SUPPORTS_CONCURRENT_SESSIONS = True
+    def __init__(self) -> None:
+        super().__init__()
+        self._env = DataForgeEnv()
+        self._last_observation: DataForgeOpenEnvObservation | None = None
+    def reset(
+        self,
+        seed: int | None = None,
+        episode_id: str | None = None,
+        **kwargs: Any,
+    ) -> DataForgeOpenEnvObservation:
+        """Reset the wrapped DataForge environment."""
+        del episode_id, kwargs
+        result = self._env.reset(seed=seed)
+        observation = _to_openenv_observation(result.observation.model_dump(mode="json"))
+        self._last_observation = observation
+        return observation
+    def step(
+        self,
+        action: DataForgeOpenEnvAction,
+        timeout_s: float | None = None,
+        **kwargs: Any,
+    ) -> DataForgeOpenEnvObservation:
+        """Step the wrapped DataForge environment."""
+        del timeout_s, kwargs
+        result = self._env.step(action.as_dataforge_payload())
+        observation = _to_openenv_observation(result.observation.model_dump(mode="json"))
+        self._last_observation = observation
+        return observation
+    def state(self) -> DataForgeOpenEnvObservation:
+        """Return the latest observation or reset lazily."""
+        if self._last_observation is None:
+            return self.reset()
+        return self._last_observation
+    def close(self) -> None:
+        """Close the wrapped environment."""
+        self._env.close()
+# ASGI application built by OpenEnv's ``create_app`` from the environment
+# class and its action / observation models.
+app = create_app(
+    DataForgeOpenEnv,
+    DataForgeOpenEnvAction,
+    DataForgeOpenEnvObservation,
+    env_name="dataforge-env",
+    max_concurrent_envs=64,
+)

dataforge/env/reward.py ADDED Viewed

	@@ -0,0 +1,128 @@

+"""Reward engine for the DataForge RL environment.
+All constants and formulas are derived bit-for-bit from REWARD_DESIGN.md.
+Terminal score: detection_rate * 0.40 + fix_rate * 0.60 - false_positives * fp_rate
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+__all__ = [
+    "DETECTION_WEIGHT",
+    "FALSE_POS_PENALTY_RATE",
+    "FIX_WEIGHT",
+    "LATE_STEP_THRESHOLD",
+    "P_FALSE_POS",
+    "P_INVALID",
+    "P_LATE_STEP",
+    "P_REINSPECT",
+    "P_WRONG_FIX",
+    "R_DIAGNOSE",
+    "R_EXPLORE",
+    "R_FIX",
+    "R_FIX_PARTIAL",
+    "R_JUSTIFY_BONUS",
+    "R_ROOT_CAUSE",
+    "R_TYPE_BONUS",
+    "SPAM_THRESHOLD",
+    "EpisodeMetrics",
+    "RewardEngine",
+]
+# Positive rewards
+R_DIAGNOSE: float = 0.10  # base value of RewardEngine.diagnose_reward
+R_TYPE_BONUS: float = 0.05  # added to R_DIAGNOSE when the issue type matches
+R_FIX: float = 0.15  # base value of RewardEngine.fix_reward for an exact fix
+R_FIX_PARTIAL: float = 0.075  # base value of fix_reward for a non-exact fix
+R_JUSTIFY_BONUS: float = 0.05  # added to a fix reward that has a justification
+R_EXPLORE: float = 0.01  # unit used by RewardEngine.compute_exploration_bonus
+R_ROOT_CAUSE: float = 0.10  # not referenced by RewardEngine in this module
+# Negative penalties
+P_FALSE_POS: float = -0.05  # not referenced by RewardEngine in this module
+P_WRONG_FIX: float = -0.08  # not referenced by RewardEngine in this module
+P_LATE_STEP: float = -0.02  # returned by compute_late_penalty past the threshold
+P_INVALID: float = -0.01  # not referenced by RewardEngine in this module
+P_REINSPECT: float = -0.01  # returned when an inspection adds no new rows
+# Thresholds
+LATE_STEP_THRESHOLD: float = 0.80  # fraction of max_steps before late penalties
+DETECTION_WEIGHT: float = 0.40  # weight of detection_rate in the terminal score
+FIX_WEIGHT: float = 0.60  # weight of fix_rate in the terminal score
+FALSE_POS_PENALTY_RATE: float = 0.05  # terminal penalty per false positive
+# The false-positive rate doubles once total diagnoses exceed
+# SPAM_THRESHOLD * total_issues (see compute_terminal_score).
+SPAM_THRESHOLD: float = 2.0
+@dataclass
+class EpisodeMetrics:
+    """Accumulated metrics for terminal score computation."""
+    found_issues: int = 0
+    total_issues: int = 0
+    fixed_issues: int = 0
+    fixable_issues: int = 0
+    false_positives: int = 0
+    @property
+    def total_diagnoses(self) -> int:
+        """Total diagnosis attempts (correct + incorrect)."""
+        return self.found_issues + self.false_positives
+class RewardEngine:
+    """Computes dense per-step and terminal rewards."""
+    def compute_terminal_score(self, metrics: EpisodeMetrics) -> float:
+        """Compute terminal score per REWARD_DESIGN.md formula."""
+        if metrics.total_issues == 0:
+            return 0.0
+        detection_rate = metrics.found_issues / metrics.total_issues
+        fix_rate = (
+            metrics.fixed_issues / metrics.fixable_issues if metrics.fixable_issues > 0 else 0.0
+        )
+        fp_rate = FALSE_POS_PENALTY_RATE
+        if (
+            metrics.total_issues > 0
+            and metrics.total_diagnoses > SPAM_THRESHOLD * metrics.total_issues
+        ):
+            fp_rate *= 2.0
+        penalty = metrics.false_positives * fp_rate
+        raw = detection_rate * DETECTION_WEIGHT + fix_rate * FIX_WEIGHT - penalty
+        return round(max(0.0, min(1.0, raw)), 4)
+    def compute_late_penalty(self, step: int, max_steps: int) -> float:
+        """Return P_LATE_STEP if past 80% budget, else 0.0."""
+        threshold = int(max_steps * LATE_STEP_THRESHOLD)
+        return P_LATE_STEP if step > threshold else 0.0
+    def compute_exploration_bonus(
+        self,
+        new_row_indices: set[int],
+        inspected_rows: set[int],
+        total_rows: int,
+        ground_truth_rows: set[int],
+        found_issue_rows: set[int],
+    ) -> float:
+        """Compute exploration bonus for newly-inspected rows."""
+        if not new_row_indices:
+            return P_REINSPECT
+        undiscovered = sum(
+            1 for r in new_row_indices if r in ground_truth_rows and r not in found_issue_rows
+        )
+        bonus = undiscovered * R_EXPLORE
+        if total_rows > 0:
+            all_inspected = inspected_rows | new_row_indices
+            coverage_ratio = len(all_inspected) / total_rows
+            bonus += len(new_row_indices) * R_EXPLORE * 0.5 * (1.0 - coverage_ratio)
+        return bonus
+    def diagnose_reward(self, type_match: bool) -> float:
+        """Reward for correct diagnosis."""
+        return R_DIAGNOSE + (R_TYPE_BONUS if type_match else 0.0)
+    def fix_reward(self, exact: bool, has_justification: bool) -> float:
+        """Reward for correct fix."""
+        reward = R_FIX if exact else R_FIX_PARTIAL
+        return reward + (R_JUSTIFY_BONUS if has_justification else 0.0)

dataforge/env/server.py ADDED Viewed

	@@ -0,0 +1,175 @@

+"""FastAPI server for the DataForge RL environment.
+Provides OpenEnv-compatible HTTP endpoints:
+    POST /reset    — Start a new episode
+    POST /step     — Execute an action
+    GET  /state    — Return current state snapshot
+    POST /close    — Close an episode's environment and drop its session
+    GET  /health   — Liveness check
+    GET  /metadata — Environment metadata
+    GET  /schema   — Action/observation JSON schemas
+"""
+from __future__ import annotations
+import logging
+import os
+from threading import RLock
+from typing import Any
+from fastapi import FastAPI, HTTPException, Request
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import TypeAdapter
+from dataforge.agent.tool_actions import Action
+from dataforge.env.environment import DataForgeEnv, EnvState
+from dataforge.env.observation import DataForgeObservation
+from dataforge.http.problem import problem_exception_handler
+from dataforge.observability import configure_fastapi_observability
+logger = logging.getLogger("dataforge.env.server")
+def _build_cors_origins() -> list[str]:
+    """Build the explicit OpenEnv CORS allowlist from the environment."""
+    raw_origins = os.environ.get("DATAFORGE_OPENENV_ORIGINS", "")
+    return [origin.strip() for origin in raw_origins.split(",") if origin.strip()]
+def _build_cors_origin_regex() -> str | None:
+    """Allow local browser development only when explicitly enabled."""
+    if os.environ.get("DATAFORGE_OPENENV_DEV") != "1":
+        return None
+    return r"^http://(?:localhost|127(?:\.\d{1,3}){3})(?::\d+)?$"
+app = FastAPI(
+    title="DataForge Environment",
+    description="OpenEnv-compatible RL environment for data-quality repair.",
+    version="0.1.0",
+)
+# CORS is opt-in: the origin list and the localhost regex both come from
+# environment variables, and credentials are never allowed.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=_build_cors_origins(),
+    allow_origin_regex=_build_cors_origin_regex(),
+    allow_credentials=False,
+    allow_methods=["GET", "POST", "OPTIONS"],
+    allow_headers=["*"],
+)
+# HTTPException values are routed through the shared problem-details handler.
+app.add_exception_handler(HTTPException, problem_exception_handler)
+configure_fastapi_observability(app, service_name="dataforge-openenv")
+# Session registry. ``_registry_lock`` guards writes to ``_sessions`` and
+# ``_default_env``; ``_default_env`` serves requests that carry no episode id.
+# NOTE(review): within this module, entries leave ``_sessions`` only via
+# POST /close, so episodes that are never closed accumulate.
+_registry_lock = RLock()
+_default_env = DataForgeEnv()
+_sessions: dict[str, DataForgeEnv] = {}
+def _get_env(episode_id: str | None) -> DataForgeEnv:
+    """Resolve an environment by episode id, preserving legacy no-id behavior."""
+    if not episode_id:
+        return _default_env
+    with _registry_lock:
+        try:
+            return _sessions[episode_id]
+        except KeyError as exc:
+            raise HTTPException(
+                status_code=404,
+                detail={"error": "episode_not_found", "episode_id": episode_id},
+            ) from exc
+def _remember_env(env: DataForgeEnv, episode_id: str) -> None:
+    """Register a session and update the legacy default environment."""
+    global _default_env
+    with _registry_lock:
+        _sessions[episode_id] = env
+        _default_env = env
+@app.post("/reset")
+async def reset(seed: int | None = None) -> dict[str, Any]:
+    """Reset the environment for a new episode."""
+    env = DataForgeEnv()
+    result = env.reset(seed=seed)
+    episode_id = str(result.info["episode_id"])
+    _remember_env(env, episode_id)
+    return result.model_dump(mode="json")
+@app.post("/step")
+async def step(action: dict[str, Any]) -> dict[str, Any]:
+    """Execute one agent action."""
+    action_payload = dict(action)
+    raw_episode_id = action_payload.pop("episode_id", None)
+    episode_id = str(raw_episode_id) if raw_episode_id else None
+    result = _get_env(episode_id).step(action_payload)
+    return result.model_dump(mode="json")
+@app.get("/state")
+async def state(episode_id: str | None = None) -> dict[str, Any]:
+    """Return current environment state snapshot."""
+    result = _get_env(episode_id).state()
+    return result.model_dump(mode="json")
+@app.post("/close")
+async def close(request: Request, episode_id: str | None = None) -> dict[str, Any]:
+    """No-op close endpoint for OpenEnv compatibility."""
+    body_episode_id: str | None = None
+    if episode_id is None:
+        try:
+            payload = await request.json()
+        except Exception:
+            payload = None
+        if isinstance(payload, dict) and payload.get("episode_id"):
+            body_episode_id = str(payload["episode_id"])
+    target_episode_id = episode_id or body_episode_id
+    env = _get_env(target_episode_id)
+    env.close()
+    if target_episode_id:
+        with _registry_lock:
+            _sessions.pop(target_episode_id, None)
+    return {"status": "closed", "episode_id": target_episode_id}
+@app.get("/health")
+async def health() -> dict[str, Any]:
+    """Liveness check."""
+    return {"status": "healthy", "environment": "dataforge-env"}
+@app.get("/metadata")
+async def metadata() -> dict[str, Any]:
+    """Environment metadata for OpenEnv discovery."""
+    return {
+        "name": "dataforge-env",
+        "version": "0.1.0",
+        "description": (
+            "DataForge RL Environment — agents learn to detect, diagnose, "
+            "and repair data-quality issues in tabular datasets."
+        ),
+        "action_types": [
+            "INSPECT_ROWS",
+            "SQL_QUERY",
+            "STAT_TEST",
+            "PATTERN_MATCH",
+            "HYPOTHESIS",
+            "ROOT_CAUSE",
+            "DIAGNOSE",
+            "FIX",
+        ],
+    }
+@app.get("/schema")
+async def schema() -> dict[str, Any]:
+    """Return JSON schemas for action and observation models."""
+    action_adapter: TypeAdapter[Action] = TypeAdapter(Action)
+    return {
+        "action": action_adapter.json_schema(),
+        "observation": DataForgeObservation.model_json_schema(),
+        "state": EnvState.model_json_schema(),
+    }

dataforge/evaluation_contract.py ADDED Viewed

	@@ -0,0 +1,76 @@

+"""Public evaluation evidence models for DataForge repair releases."""
+from __future__ import annotations
+import hashlib
+import json
+from typing import Any, Literal
+from pydantic import BaseModel, Field
+# Closed set of labels describing how a task's answer can be inferred.
+InferabilityLabel = Literal[
+    "deterministic_normalization",
+    "context_derivable",
+    "external_reference_required",
+    "not_inferable_from_prompt",
+]
+# Default value of ``ReleaseEvidenceV2.promotion_slice``.
+PROMOTION_SLICE: InferabilityLabel = "deterministic_normalization"
+# The two frozensets and PROMOTION_SLICE together cover every label once.
+ABSTENTION_SLICES = frozenset({"external_reference_required", "not_inferable_from_prompt"})
+AUXILIARY_SLICES = frozenset({"context_derivable"})
+# Closed set of values accepted by ``ReleaseEvidenceV2.promotion_status``.
+PromotionStatus = Literal[
+    "diagnostic_only",
+    "diagnostic_promoted",
+    "quality_improved_verified",
+    "public_quality_milestone",
+    "rejected",
+]
+class EvaluationTaskV2(BaseModel):
+    """One auditable, source-stable model grading task.
+    Ground truth is retained for local grading but excluded from normal JSON
+    serialization so prompts and public reports cannot accidentally leak labels.
+    """
+    # The Literal pins the schema tag to exactly this string.
+    schema_version: Literal["evaluation_task_v2"] = "evaluation_task_v2"
+    task_id: str = Field(min_length=1)
+    # Exactly 64 characters, the length of a hex SHA-256 digest such as the
+    # one ``prompt_sha256`` returns.
+    prompt_hash: str = Field(min_length=64, max_length=64)
+    dataset_sha: str = Field(min_length=1)
+    split_id: str = Field(min_length=1)
+    inferability: InferabilityLabel
+    prompt: dict[str, Any]
+    # Both lists must contain at least one entry.
+    allowed_columns: list[str] = Field(min_length=1)
+    valid_rows: list[int] = Field(min_length=1)
+    provenance: dict[str, Any]
+    # ``exclude=True`` keeps the labels out of serialized output.
+    hidden_ground_truth: list[dict[str, Any]] = Field(default_factory=list, exclude=True)
+    # Frozen config: pydantic rejects attribute assignment after construction.
+    model_config = {"frozen": True}
+class ReleaseEvidenceV2(BaseModel):
+    """Serializable release-gate evidence for model and benchmark promotion."""
+    # The Literal pins the schema tag to exactly this string.
+    schema_version: Literal["release_evidence_v2"] = "release_evidence_v2"
+    # Identity of the evaluated model and dataset; all must be non-empty.
+    model_repo: str = Field(min_length=1)
+    model_sha: str = Field(min_length=1)
+    dataset_repo: str = Field(min_length=1)
+    dataset_sha: str = Field(min_length=1)
+    # Scores and rates are validated to lie in [0.0, 1.0].
+    strict_macro_f1: float = Field(ge=0.0, le=1.0)
+    canonicalized_macro_f1: float = Field(ge=0.0, le=1.0)
+    parse_success_rate: float = Field(ge=0.0, le=1.0)
+    schema_case_error_count: int = Field(ge=0)
+    promotion_slice: InferabilityLabel = PROMOTION_SLICE
+    # Per-slice breakdowns keyed by inferability label.
+    slice_scores: dict[InferabilityLabel, dict[str, float | int]] = Field(default_factory=dict)
+    inferability_slice_scores: dict[InferabilityLabel, float] = Field(default_factory=dict)
+    package_versions: dict[str, str] = Field(default_factory=dict)
+    # Required: there is no default promotion status.
+    promotion_status: PromotionStatus
+    gate_failures: list[str] = Field(default_factory=list)
+    # Frozen config: pydantic rejects attribute assignment after construction.
+    model_config = {"frozen": True}
+def prompt_sha256(prompt: dict[str, Any]) -> str:
+    """Hash a prompt payload with stable JSON serialization."""
+    encoded = json.dumps(prompt, sort_keys=True, separators=(",", ":")).encode("utf-8")
+    return hashlib.sha256(encoded).hexdigest()

dataforge/fixtures/hospital_10rows.csv ADDED Viewed

	@@ -0,0 +1,11 @@

+provider_number,hospital_name,city,state,zip_code,phone_number,rating,mortality_rate,readmission_rate,er_wait_time
+PRV001,General Hospital,Springfield,IL,62701,2175550101,4.2,0.023,0.145,28
+PRV002,St. Mary Medical Center,Chicago,IL,60601,3125550202,3.8,0.031,0.162,35
+PRV001,Springfield Medical,Springfield,IL,62701,2175550303,4.5,0.019,0.138,22
+PRV003,Mercy Hospital,Peoria,IL,61602,3095550404,3.5,0.028,0.158,31
+PRV004,Northwestern Memorial,Chicago,IL,60611,not available,4.1,0.025,0.149,26
+PRV005,Rush University MC,Chicago,IL,60612,3125550606,45.0,0.022,0.141,29
+PRV006,Advocate Christ,Oak Lawn,IL,60453,7085550707,3.9,0.027,0.155,33
+PRV007,Loyola University MC,Maywood,IL,60153,7085550808,4.3,0.020,0.142,25
+PRV008,Presence St. Joseph,Joliet,IL,60435,8155550909,4.0,0.026,0.151,30
+PRV009,Edward Hospital,Naperville,IL,60540,6305551010,3.7,0.029,0.160,34

dataforge/fixtures/hospital_schema.yaml ADDED Viewed

	@@ -0,0 +1,17 @@

+# Hospital dataset schema for DataForge profile command.
+columns:
+  provider_number: str
+  hospital_name: str
+  city: str
+  state: str
+  zip_code: str
+  phone_number: str
+  rating: float
+  mortality_rate: float
+  readmission_rate: float
+  er_wait_time: int
+functional_dependencies:
+  - determinant: [provider_number]
+    dependent: hospital_name

dataforge/http/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """HTTP helpers shared by DataForge backend surfaces."""

dataforge/http/problem.py ADDED Viewed

	@@ -0,0 +1,99 @@

+"""RFC 9457 problem details helpers for FastAPI surfaces."""
+from __future__ import annotations
+from collections.abc import Mapping
+from typing import Any
+from fastapi import HTTPException, Request
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel, ConfigDict, Field
+class ProblemDetail(BaseModel):
+    """RFC 9457 problem detail response with extension members."""
+    # "about:blank" is used when the caller supplies no problem type.
+    type: str = Field(default="about:blank")
+    title: str
+    status: int
+    detail: str
+    instance: str | None = None
+    # strict=True disables type coercion on the declared fields;
+    # extra="allow" lets undeclared keys through as extension members.
+    model_config = ConfigDict(strict=True, extra="allow")
+def problem_body(
+    *,
+    status: int,
+    title: str,
+    detail: str,
+    type_: str = "about:blank",
+    instance: str | None = None,
+    **extensions: Any,
+) -> dict[str, Any]:
+    """Build a problem details JSON object."""
+    body = ProblemDetail(
+        type=type_,
+        title=title,
+        status=status,
+        detail=detail,
+        instance=instance,
+        **extensions,
+    )
+    return body.model_dump(mode="json", exclude_none=True)
+def problem_response(
+    *,
+    status: int,
+    title: str,
+    detail: str,
+    type_: str = "about:blank",
+    instance: str | None = None,
+    headers: Mapping[str, str] | None = None,
+    **extensions: Any,
+) -> JSONResponse:
+    """Return an RFC 9457 JSON response."""
+    return JSONResponse(
+        status_code=status,
+        content=problem_body(
+            status=status,
+            title=title,
+            detail=detail,
+            type_=type_,
+            instance=instance,
+            **extensions,
+        ),
+        headers=headers,
+        media_type="application/problem+json",
+    )
+async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
+    """Normalize FastAPI HTTPException values into problem details."""
+    raw_detail = exc.detail
+    extensions: dict[str, Any] = {}
+    if isinstance(raw_detail, dict):
+        error_code = str(raw_detail.get("error", "http_error"))
+        message = str(raw_detail.get("message") or raw_detail.get("detail") or error_code)
+        extensions.update(raw_detail)
+    else:
+        error_code = "http_error"
+        message = str(raw_detail)
+    return problem_response(
+        status=exc.status_code,
+        type_=f"https://dataforge.local/problems/{error_code}",
+        title=error_code.replace("_", " ").title(),
+        detail=message,
+        instance=str(request.url.path),
+        headers=exc.headers,
+        **extensions,
+    )
+async def problem_exception_handler(request: Request, exc: Exception) -> JSONResponse:
+    """Adapter with the broad exception signature Starlette expects."""
+    if isinstance(exc, HTTPException):
+        return await http_exception_handler(request, exc)
+    raise exc

dataforge/integrations/dbt.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """The dbt integration lives in the separate ``dataforge15-dbt`` package."""

dataforge/observability.py ADDED Viewed

	@@ -0,0 +1,76 @@

+"""Optional OpenTelemetry hooks for DataForge backend surfaces."""
+from __future__ import annotations
+import os
+from collections.abc import Iterator
+from contextlib import contextmanager, nullcontext
+from importlib import import_module
+from typing import Any
+# Substrings that mark an attribute name as sensitive. ``_safe_attrs`` matches
+# them anywhere in the lower-cased name, so "key" also matches e.g. "api_key".
+_SENSITIVE_ATTR_FRAGMENTS = ("authorization", "cookie", "token", "key", "secret", "password")
+def _otel_enabled() -> bool:
+    """Return whether optional OpenTelemetry instrumentation is enabled."""
+    return os.environ.get("DATAFORGE_OTEL_ENABLED", "").strip().lower() in {
+        "1",
+        "true",
+        "yes",
+        "on",
+    }
+def _safe_attrs(attributes: dict[str, Any]) -> dict[str, str | int | float | bool]:
+    """Keep only scalar, non-sensitive telemetry attributes."""
+    safe: dict[str, str | int | float | bool] = {}
+    for key, value in attributes.items():
+        lowered = key.lower()
+        if any(fragment in lowered for fragment in _SENSITIVE_ATTR_FRAGMENTS):
+            continue
+        if lowered in {"row_values", "rows", "payload", "source_bytes", "csv"}:
+            continue
+        if isinstance(value, str | int | float | bool):
+            safe[key] = value
+    return safe
+def configure_fastapi_observability(app: Any, *, service_name: str) -> bool:
+    """Instrument a FastAPI app when OpenTelemetry is explicitly enabled."""
+    if not _otel_enabled():
+        return False
+    try:
+        fastapi_instrumentation = import_module("opentelemetry.instrumentation.fastapi")
+        trace_module = import_module("opentelemetry.trace")
+    except ImportError:
+        return False
+    app.state.dataforge_service_name = service_name
+    fastapi_instrumentation.FastAPIInstrumentor.instrument_app(
+        app,
+        tracer_provider=trace_module.get_tracer_provider(),
+        excluded_urls="/api/docs,/docs,/redoc,/openapi.json",
+    )
+    return True
+@contextmanager
+def repair_stage_span(stage: str, **attributes: Any) -> Iterator[None]:
+    """Create a repair-stage span when OpenTelemetry is available."""
+    if not _otel_enabled():
+        with nullcontext():
+            yield
+        return
+    try:
+        trace_module = import_module("opentelemetry.trace")
+    except ImportError:
+        with nullcontext():
+            yield
+        return
+    tracer = trace_module.get_tracer("dataforge.repair")
+    with tracer.start_as_current_span(stage) as span:
+        for key, value in _safe_attrs(attributes).items():
+            span.set_attribute(key, value)
+        yield

dataforge/py.typed ADDED Viewed

	@@ -0,0 +1 @@


1	+

dataforge/release/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ """Release verification helpers for DataForge."""
2	+