Spaces:
Sleeping
Sleeping
"""Central configuration: paths, seeds, model names, severity mapping."""
from __future__ import annotations

from pathlib import Path
# Project root: two levels up from this file, i.e. the parent of the
# directory that contains this module.
ROOT = Path(__file__).resolve().parents[1]
DATA_DIR = ROOT / "data"
MODELS_DIR = ROOT / "models"
OUTPUTS_DIR = ROOT / "outputs"
NOTEBOOKS_DIR = ROOT / "notebooks"

# Create the data/models/outputs directories at import time (no error if they
# already exist). NOTEBOOKS_DIR is deliberately not in this list, so it is
# only a path and is never created here.
for _p in (DATA_DIR, MODELS_DIR, OUTPUTS_DIR):
    _p.mkdir(exist_ok=True, parents=True)

# CUAD dataset identifier (presumably a Hugging Face Hub dataset id -- confirm
# against the loader) and the Parquet file path under DATA_DIR.
CUAD_HF_ID = "dvgodoy/CUAD_v1_Contract_Understanding_clause_classification"
CUAD_PARQUET = DATA_DIR / "cuad.parquet"
# Embedding model name and the embedding width used alongside it.
# NOTE(review): EMBED_DIM is presumably the output dimension of
# EMBED_MODEL_NAME -- keep the two in sync if the model is changed.
EMBED_MODEL_NAME = "BAAI/bge-small-en-v1.5"
EMBED_DIM = 384

# Random seed and test-split fraction (presumably consumed by the train/test
# split elsewhere in the project -- confirm against callers).
SEED = 42
TEST_SIZE = 0.20
# EPCC severity tiers over CUAD's 41 clause labels, viewed from an EPC
# contractor's perspective. "Critical" labels are the ones where a false
# negative creates direct commercial exposure (uncapped liability, missing
# LD/insurance terms, termination economics). "Moderate" affect rights and
# downstream operations. "Low" are bookkeeping fields useful for indexing.
#
# Each label appears in exactly one tier: 8 critical + 27 moderate + 6 low.
SEVERITY_TIERS: dict[str, list[str]] = {
    "critical": [
        "Uncapped Liability",
        "Cap On Liability",
        "Liquidated Damages",
        "Termination For Convenience",
        "Insurance",
        "Warranty Duration",
        "Covenant Not To Sue",
        "Source Code Escrow",
    ],
    "moderate": [
        "Change Of Control",
        "Anti-Assignment",
        "Audit Rights",
        "Governing Law",
        "Non-Compete",
        "Exclusivity",
        "No-Solicit Of Customers",
        "No-Solicit Of Employees",
        "Non-Disparagement",
        "Rofr/Rofo/Rofn",
        "Revenue/Profit Sharing",
        "Price Restrictions",
        "Minimum Commitment",
        "Volume Restriction",
        "Ip Ownership Assignment",
        "Joint Ip Ownership",
        "License Grant",
        "Non-Transferable License",
        "Affiliate License-Licensor",
        "Affiliate License-Licensee",
        "Unlimited/All-You-Can-Eat-License",
        "Irrevocable Or Perpetual License",
        "Post-Termination Services",
        "Third Party Beneficiary",
        "Competitive Restriction Exception",
        "Notice Period To Terminate Renewal",
        "Renewal Term",
    ],
    "low": [
        "Document Name",
        "Parties",
        "Agreement Date",
        "Effective Date",
        "Expiration Date",
        "Most Favored Nation",
    ],
}
def label_to_severity(
    tiers: dict[str, list[str]] | None = None,
) -> dict[str, str]:
    """Return a flat ``label -> severity tier`` mapping.

    Args:
        tiers: Mapping of tier name to the labels in that tier. When omitted,
            the module-level ``SEVERITY_TIERS`` table is used.

    Returns:
        A new dict with one entry per label, in the iteration order of
        ``tiers`` and of each tier's label list.

    Raises:
        ValueError: If the same label is listed under two different tiers.
            Without this check the later tier would silently win, which
            could hide a misclassified label in the table.
    """
    if tiers is None:
        tiers = SEVERITY_TIERS

    out: dict[str, str] = {}
    for tier, labels in tiers.items():
        for lab in labels:
            # setdefault keeps the first tier recorded for a label, so a
            # different value here means the label is in two tiers.
            first = out.setdefault(lab, tier)
            if first != tier:
                raise ValueError(
                    f"label {lab!r} is assigned to both {first!r} and {tier!r}"
                )
    return out
# Confidence threshold below which the classifier abstains (routes to human
# review). Chosen empirically from the calibration sweep and overridable.
DEFAULT_ABSTAIN_THRESHOLD = 0.45