"""Central configuration: paths, seeds, model names, severity mapping.""" from __future__ import annotations from pathlib import Path ROOT = Path(__file__).resolve().parents[1] DATA_DIR = ROOT / "data" MODELS_DIR = ROOT / "models" OUTPUTS_DIR = ROOT / "outputs" NOTEBOOKS_DIR = ROOT / "notebooks" for _p in (DATA_DIR, MODELS_DIR, OUTPUTS_DIR): _p.mkdir(exist_ok=True, parents=True) CUAD_HF_ID = "dvgodoy/CUAD_v1_Contract_Understanding_clause_classification" CUAD_PARQUET = DATA_DIR / "cuad.parquet" EMBED_MODEL_NAME = "BAAI/bge-small-en-v1.5" EMBED_DIM = 384 SEED = 42 TEST_SIZE = 0.20 # EPCC severity tiers over CUAD's 41 clause labels, viewed from an EPC # contractor's perspective. "Critical" labels are the ones where a false # negative creates direct commercial exposure (uncapped liability, missing # LD/insurance terms, termination economics). "Moderate" affect rights and # downstream operations. "Low" are bookkeeping fields useful for indexing. SEVERITY_TIERS: dict[str, list[str]] = { "critical": [ "Uncapped Liability", "Cap On Liability", "Liquidated Damages", "Termination For Convenience", "Insurance", "Warranty Duration", "Covenant Not To Sue", "Source Code Escrow", ], "moderate": [ "Change Of Control", "Anti-Assignment", "Audit Rights", "Governing Law", "Non-Compete", "Exclusivity", "No-Solicit Of Customers", "No-Solicit Of Employees", "Non-Disparagement", "Rofr/Rofo/Rofn", "Revenue/Profit Sharing", "Price Restrictions", "Minimum Commitment", "Volume Restriction", "Ip Ownership Assignment", "Joint Ip Ownership", "License Grant", "Non-Transferable License", "Affiliate License-Licensor", "Affiliate License-Licensee", "Unlimited/All-You-Can-Eat-License", "Irrevocable Or Perpetual License", "Post-Termination Services", "Third Party Beneficiary", "Competitive Restriction Exception", "Notice Period To Terminate Renewal", "Renewal Term", ], "low": [ "Document Name", "Parties", "Agreement Date", "Effective Date", "Expiration Date", "Most Favored Nation", ], } def label_to_severity() -> dict[str, str]: """Flat label -> severity tier mapping.""" out: dict[str, str] = {} for tier, labels in SEVERITY_TIERS.items(): for lab in labels: out[lab] = tier return out # Confidence threshold below which the classifier abstains (routes to human # review). Chosen empirically from the calibration sweep and overridable. DEFAULT_ABSTAIN_THRESHOLD = 0.45