# Packaging metadata (PEP 621) and tool configuration for the dataforge15 distribution.

[project]
name = "dataforge15"
version = "0.1.0rc1"
description = "DataForge15: CLI-first data-quality detection and reversible repair for tabular data."
readme = "README.md"
# Bare SPDX license expression (PEP 639 form). The build backend must understand
# this form; see the version floor in [build-system].
license = "Apache-2.0"
requires-python = ">=3.11,<3.13"
keywords = ["data-quality", "ai-agent", "llm", "rl", "smt", "dbt"]
classifiers = [
    "Development Status :: 3 - Alpha",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
]
dependencies = [
    "pydantic>=2.7",
    "typer>=0.24,<0.25",
    "rich>=13.7",
    "textual>=8.2,<9",
    "z3-solver>=4.13",
    "pyyaml>=6.0",
]

[project.optional-dependencies]
bench = [
    "pandas>=2.2",
    "httpx>=0.27",
    "tenacity>=8.3",
    "python-dotenv>=1.0",
    "pyarrow>=16.0",
]
causal = [
    "pandas>=2.2",
    "numpy>=1.26",
    "networkx>=3.3",
    "causal-learn>=0.1.4",
    "hyppo>=0.5.2",
    "scipy>=1.13",
]
dev = [
    "pytest>=9.0.3",
    "pytest-cov>=5.0",
    "pytest-benchmark>=4.0",
    "pytest-xdist>=3.6",
    "hypothesis>=6.100",
    "mutmut>=3.5",
    "build>=1.2",
    "pip-audit>=2.10,<3",
    "cyclonedx-bom>=7.3,<8",
    "cryptography>=46.0.7",
    "idna>=3.15",
    "pip>=26.1.1",
    "urllib3>=2.7",
    "ruff>=0.11",
    "mypy>=1.10",
    "pandas-stubs>=2.2",
    "types-PyYAML",
    "huggingface_hub==1.13.0",
    "httpx>=0.27",
    "tenacity>=8.3",
    "python-dotenv>=1.0",
    "pyarrow>=16.0",
    "networkx>=3.3",
    "causal-learn>=0.1.4",
    "hyppo>=0.5.2",
    "scipy>=1.13",
    "sqlglot>=25.0",
    "duckdb>=1.0",
]
# Every requirement in this extra is an exact (==) pin.
train = [
    "trl==1.4.0",
    "transformers==5.7.0",
    "accelerate==1.13.0",
    "peft==0.19.1",
    "bitsandbytes==0.49.2",
    "datasets==4.8.5",
    "huggingface_hub==1.13.0",
    "pyyaml==6.0.3",
    "pandas==2.3.3",
    "tensorboard==2.20.0",
]
eval = [
    "matplotlib>=3.9",
    "seaborn>=0.13",
]
providers = [
    "httpx>=0.27",
    "tenacity>=8.3",
    "python-dotenv>=1.0",
]
pandas = [
    "pandas>=2.2",
]
playground = [
    "pandas>=2.2",
    "fastapi>=0.136.1",
    "starlette>=1.0.1,<2",
    "uvicorn[standard]>=0.35",
    "python-multipart>=0.0.27",
    "slowapi>=0.1.9",
]
openenv = [
    "pandas>=2.2",
    "openenv-core[core]>=0.2.2",
    "authlib>=1.7.1,!=1.7.0",
    "cryptography>=46.0.7",
    "duckdb>=1.0",
    "sqlglot>=25.0",
    "scipy>=1.13",
    "networkx>=3.3",
    "causal-learn>=0.1.4",
    "hyppo>=0.5.2",
]
# Self-referential extra that pulls in every other extra declared above.
all = [
    "dataforge15[bench,causal,dev,eval,pandas,playground,providers,train,openenv]",
]

# Both command names resolve to the same entry point.
[project.scripts]
dataforge15 = "dataforge.cli:app"
dataforge = "dataforge.cli:app"

[build-system]
# setuptools 77 is the first release that accepts `project.license` as a bare
# SPDX string (PEP 639); older releases reject this file's metadata at build time.
requires = ["setuptools>=77", "wheel"]
build-backend = "setuptools.build_meta"

[tool.setuptools.packages.find]
where = ["."]
include = ["dataforge", "dataforge.*"]
exclude = ["data_quality_env", "data_quality_env.*"]

[tool.setuptools.package-data]
dataforge = [
    "py.typed",
    "fixtures/*.csv",
    "fixtures/*.yaml",
    "datasets/embedded/**/*.csv",
    "safety/constitutions/*.yaml",
    "safety/adversarial/*.yaml",
]

[tool.ruff]
line-length = 100
target-version = "py311"
extend-exclude = [".hf-space-repo", ".hf-space-stage", ".hf-space-stage-plan"]

[tool.ruff.lint]
select = ["E", "F", "W", "I", "N", "UP", "B", "A", "C4", "PIE", "RET", "SIM"]
ignore = ["E501"]

[tool.ruff.lint.per-file-ignores]
"data_quality_env/**/*.py" = ["B007", "B027", "E402", "E731", "F401", "F541", "F841", "I001", "N", "RET", "SIM", "UP"]
"training/kaggle/sft_warmup_kaggle.ipynb" = ["E402"]

[tool.ruff.format]
quote-style = "double"
indent-style = "space"
line-ending = "auto"

[tool.mypy]
strict = true
python_version = "3.11"
warn_unused_configs = true
warn_redundant_casts = true
warn_unused_ignores = true
disallow_untyped_defs = true
explicit_package_bases = true
# Regexes are written as TOML literal strings so backslashes need no doubling.
exclude = [
    '^\.hf-space-repo/',
    '^\.hf-space-stage/',
    '^\.hf-space-stage-plan/',
    # loose root-level scripts (hackathon legacy)
    '^[^/]*\.py$',
    '^(training|playground|benchmark_results|datasets)/',
]

[tool.pytest.ini_options]
minversion = "8.0"
addopts = "-ra --strict-markers --strict-config"
testpaths = ["tests"]
pythonpath = ["."]
# --strict-markers (in addopts) rejects any marker not registered here.
markers = [
    "slow: marks tests as slow",
    "integration: marks tests as integration tests",
    "requires_network: tests that need internet access",
    "requires_llm: tests that call a free-tier LLM API",
]

[tool.coverage.run]
source = ["dataforge"]
branch = true

[tool.coverage.report]
fail_under = 90
exclude_lines = [
    "pragma: no cover",
    "raise NotImplementedError",
    "if TYPE_CHECKING:",
]