# Core distribution metadata (PEP 621 `[project]` table).
[project]
name = "dataforge15"
# Pre-release version: the "rc1" suffix marks a release candidate of 0.1.0.
version = "0.1.0rc1"
description = "DataForge15: CLI-first data-quality detection and reversible repair for tabular data."
readme = "README.md"
# License given as a plain SPDX expression string (not the legacy table form).
license = "Apache-2.0"
# Only Python 3.11 and 3.12 are accepted; 3.13+ is excluded by the upper bound.
# The classifiers below list the same two versions, and ruff's target-version
# and mypy's python_version are set to the 3.11 floor.
requires-python = ">=3.11,<3.13"
keywords = ["data-quality", "ai-agent", "llm", "rl", "smt", "dbt"]
classifiers = [
    "Development Status :: 3 - Alpha",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
]
# Runtime requirements installed with the base package (no extras selected).
# typer and textual carry explicit upper bounds; the rest are floor-only.
# NOTE(review): the caps presumably guard against breaking releases — confirm
# before widening them.
dependencies = [
    "pydantic>=2.7",
    "typer>=0.24,<0.25",
    "rich>=13.7",
    "textual>=8.2,<9",
    "z3-solver>=4.13",
    "pyyaml>=6.0",
]

# Optional extras, selected with `pip install "dataforge15[<extra>]"`.
# Several requirement strings are repeated across extras (pandas, httpx,
# tenacity, scipy, networkx, ...); keep the duplicated specifiers in sync when
# changing any one of them.
[project.optional-dependencies]
# Data-frame, HTTP-client, retry and dotenv libraries plus pyarrow.
# NOTE(review): presumably the benchmark harness stack — confirm against code.
bench = [
    "pandas>=2.2",
    "httpx>=0.27",
    "tenacity>=8.3",
    "python-dotenv>=1.0",
    "pyarrow>=16.0",
]
# Numeric / graph / causal-discovery libraries.
causal = [
    "pandas>=2.2",
    "numpy>=1.26",
    "networkx>=3.3",
    "causal-learn>=0.1.4",
    "hyppo>=0.5.2",
    "scipy>=1.13",
]
# Contributor tooling. Besides the test, build, audit, lint and type-check
# tools, this list repeats the `bench` requirements (minus pandas), the
# `causal` requirements (minus pandas and numpy), and sqlglot/duckdb, which
# also appear in `openenv`.
dev = [
    "pytest>=9.0.3",
    "pytest-cov>=5.0",
    "pytest-benchmark>=4.0",
    "pytest-xdist>=3.6",
    "hypothesis>=6.100",
    "mutmut>=3.5",
    "build>=1.2",
    "pip-audit>=2.10,<3",
    "cyclonedx-bom>=7.3,<8",
    # Floor-only entries for cryptography, idna, pip and urllib3.
    # NOTE(review): these look like security floors for transitive
    # dependencies — confirm before relaxing them.
    "cryptography>=46.0.7",
    "idna>=3.15",
    "pip>=26.1.1",
    "urllib3>=2.7",
    "ruff>=0.11",
    "mypy>=1.10",
    "pandas-stubs>=2.2",
    "types-PyYAML",
    # Same exact pin as in the `train` extra.
    "huggingface_hub==1.13.0",
    "httpx>=0.27",
    "tenacity>=8.3",
    "python-dotenv>=1.0",
    "pyarrow>=16.0",
    "networkx>=3.3",
    "causal-learn>=0.1.4",
    "hyppo>=0.5.2",
    "scipy>=1.13",
    "sqlglot>=25.0",
    "duckdb>=1.0",
]
# Model-training stack. Every requirement here is pinned to an exact version
# (==), unlike the range specifiers used in the other extras. The pyyaml and
# pandas pins fall inside the ranges declared elsewhere in this file
# (pyyaml>=6.0, pandas>=2.2).
train = [
    "trl==1.4.0",
    "transformers==5.7.0",
    "accelerate==1.13.0",
    "peft==0.19.1",
    "bitsandbytes==0.49.2",
    "datasets==4.8.5",
    "huggingface_hub==1.13.0",
    "pyyaml==6.0.3",
    "pandas==2.3.3",
    "tensorboard==2.20.0",
]
# Plotting libraries.
eval = [
    "matplotlib>=3.9",
    "seaborn>=0.13",
]
# The same three requirements also appear in `bench` and `dev`.
providers = [
    "httpx>=0.27",
    "tenacity>=8.3",
    "python-dotenv>=1.0",
]
# pandas on its own, with the same floor the other extras use.
pandas = [
    "pandas>=2.2",
]
# ASGI web stack (FastAPI / Starlette / Uvicorn) plus multipart form parsing
# and rate limiting.
playground = [
    "pandas>=2.2",
    "fastapi>=0.136.1",
    "starlette>=1.0.1,<2",
    "uvicorn[standard]>=0.35",
    "python-multipart>=0.0.27",
    "slowapi>=0.1.9",
]
# openenv-core plus SQL and causal libraries also listed in other extras.
# NOTE(review): on authlib, `!=1.7.0` is already excluded by `>=1.7.1`; the
# extra clause is harmless but redundant.
openenv = [
    "pandas>=2.2",
    "openenv-core[core]>=0.2.2",
    "authlib>=1.7.1,!=1.7.0",
    "cryptography>=46.0.7",
    "duckdb>=1.0",
    "sqlglot>=25.0",
    "scipy>=1.13",
    "networkx>=3.3",
    "causal-learn>=0.1.4",
    "hyppo>=0.5.2",
]
# Self-referential extra: pulls in every other extra defined in this table.
# The list is maintained by hand — extend it whenever a new extra is added.
all = [
    "dataforge15[bench,causal,dev,eval,pandas,playground,providers,train,openenv]",
]

# Console entry points. Both command names resolve to the same object: `app`
# in the `dataforge.cli` module.
[project.scripts]
dataforge15 = "dataforge.cli:app"
dataforge = "dataforge.cli:app"

# Build backend configuration (PEP 517 / PEP 518).
# The setuptools floor is 77.0.3 because `[project].license` is written as an
# SPDX expression string ("Apache-2.0", PEP 639). Older setuptools releases
# only accept the legacy `{ text = ... }` / `{ file = ... }` table form and
# fail pyproject validation on the string form, e.g. when building with
# --no-build-isolation or under a constraints file.
[build-system]
requires = ["setuptools>=77.0.3", "wheel"]
build-backend = "setuptools.build_meta"

# Package discovery: search from the repository root and ship only the
# `dataforge` package and its subpackages.
[tool.setuptools.packages.find]
where = ["."]
include = ["dataforge", "dataforge.*"]
# NOTE(review): the include patterns above already leave `data_quality_env`
# out; this exclude looks like an explicit safeguard — confirm it is intended.
exclude = ["data_quality_env", "data_quality_env.*"]

# Non-Python files bundled inside the `dataforge` package. Patterns are
# interpreted relative to the package directory.
[tool.setuptools.package-data]
dataforge = [
    # PEP 561 marker: tells type checkers the package ships inline type hints.
    "py.typed",
    "fixtures/*.csv",
    "fixtures/*.yaml",
    # `**` matches nested directories, so CSVs at any depth are included.
    "datasets/embedded/**/*.csv",
    "safety/constitutions/*.yaml",
    "safety/adversarial/*.yaml",
]

# Ruff settings shared by the linter and the formatter.
[tool.ruff]
line-length = 100
# Matches the lower bound of `requires-python` (>=3.11).
target-version = "py311"
# Added on top of Ruff's default excludes. The same three directories are
# excluded from mypy under [tool.mypy].
extend-exclude = [".hf-space-repo", ".hf-space-stage", ".hf-space-stage-plan"]

# Lint rule selection.
[tool.ruff.lint]
# Enabled rule families, one per line.
select = [
    "E",    # pycodestyle errors
    "F",    # Pyflakes
    "W",    # pycodestyle warnings
    "I",    # isort
    "N",    # pep8-naming
    "UP",   # pyupgrade
    "B",    # flake8-bugbear
    "A",    # flake8-builtins
    "C4",   # flake8-comprehensions
    "PIE",  # flake8-pie
    "RET",  # flake8-return
    "SIM",  # flake8-simplify
]
# E501 is "line too long"; the linter does not enforce it even though
# line-length is set under [tool.ruff].
ignore = ["E501"]

# Per-path rule suppressions.
[tool.ruff.lint.per-file-ignores]
# `data_quality_env` is also excluded from packaging under
# [tool.setuptools.packages.find]; it is linted with a much looser rule set.
"data_quality_env/**/*.py" = [
    "B007",  # unused loop control variable
    "B027",  # empty method in abstract base class without @abstractmethod
    "E402",  # module-level import not at top of file
    "E731",  # lambda assigned to a name
    "F401",  # unused import
    "F541",  # f-string without placeholders
    "F841",  # unused local variable
    "I001",  # unsorted imports
    "N",     # all pep8-naming rules
    "RET",   # all flake8-return rules
    "SIM",   # all flake8-simplify rules
    "UP",    # all pyupgrade rules
]
# Allow imports below the top of the file in this one notebook.
"training/kaggle/sft_warmup_kaggle.ipynb" = ["E402"]

# Formatter settings: double quotes, space indentation, and line endings
# chosen automatically per file.
# NOTE(review): these three values appear to match Ruff's documented defaults;
# they are stated explicitly here — confirm against the Ruff version in use.
[tool.ruff.format]
quote-style = "double"
indent-style = "space"
line-ending = "auto"

# Static type checking. `python_version` matches the lower bound of
# `requires-python`.
[tool.mypy]
python_version = "3.11"
strict = true
explicit_package_bases = true
disallow_untyped_defs = true
warn_redundant_casts = true
warn_unused_configs = true
warn_unused_ignores = true
# Regular expressions matched against paths; written as TOML literal strings
# so the regex backslashes need no extra escaping.
exclude = [
    '^\.hf-space-repo/',
    '^\.hf-space-stage/',
    '^\.hf-space-stage-plan/',
    # loose root-level scripts (hackathon legacy)
    '^[^/]*\.py$',
    '^(training|playground|benchmark_results|datasets)/',
]

# pytest configuration.
[tool.pytest.ini_options]
# NOTE(review): the `dev` extra requires pytest>=9.0.3, so this floor is looser
# than what a dev install actually provides.
minversion = "8.0"
# -ra: print a short summary for every outcome except passes.
# --strict-markers: markers not registered under `markers` below are errors.
# --strict-config: problems in this configuration section are errors.
addopts = "-ra --strict-markers --strict-config"
testpaths = ["tests"]
# Puts the repository root on sys.path so tests can import the package
# without installing it.
pythonpath = ["."]
# Registered markers; required because --strict-markers is enabled above.
markers = [
    "slow: marks tests as slow",
    "integration: marks tests as integration tests",
    "requires_network: tests that need internet access",
    "requires_llm: tests that call a free-tier LLM API",
]

# Coverage measurement: only the `dataforge` package is measured, with branch
# coverage enabled in addition to line coverage.
[tool.coverage.run]
source = ["dataforge"]
branch = true

# Coverage reporting: the report fails when total coverage is below 90%.
[tool.coverage.report]
fail_under = 90
# Regexes for lines left out of the report. Setting `exclude_lines` replaces
# coverage.py's built-in list, which is why "pragma: no cover" is listed
# again here.
exclude_lines = [
    "pragma: no cover",
    "raise NotImplementedError",
    "if TYPE_CHECKING:",
]