maralzar
Initial commit: EPCC clause classifier Streamlit demo for HF Spaces
5212b8e
Raw
History Blame Contribute Delete
2.77 kB
"""Central configuration: paths, seeds, model names, severity mapping."""
from __future__ import annotations
from pathlib import Path
# Project root: the parent of the directory that contains this file.
ROOT = Path(__file__).resolve().parents[1]

# Project sub-directories, all anchored at ROOT.
DATA_DIR = ROOT / "data"
MODELS_DIR = ROOT / "models"
OUTPUTS_DIR = ROOT / "outputs"
NOTEBOOKS_DIR = ROOT / "notebooks"

# Ensure the data, models and outputs directories exist as soon as this
# module is imported. NOTEBOOKS_DIR is not created here.
for _p in (DATA_DIR, MODELS_DIR, OUTPUTS_DIR):
    _p.mkdir(exist_ok=True, parents=True)

# CUAD clause-classification dataset: remote identifier and the local parquet
# path under DATA_DIR.
# NOTE(review): CUAD_HF_ID looks like a Hugging Face Hub dataset id -- confirm
# against the loader that consumes it.
CUAD_HF_ID = "dvgodoy/CUAD_v1_Contract_Understanding_clause_classification"
CUAD_PARQUET = DATA_DIR / "cuad.parquet"

# Embedding model name and embedding width.
# NOTE(review): EMBED_DIM presumably has to match the output size of
# EMBED_MODEL_NAME -- update both together.
EMBED_MODEL_NAME = "BAAI/bge-small-en-v1.5"
EMBED_DIM = 384

# Random seed and test-split size.
# NOTE(review): TEST_SIZE is presumably the held-out fraction of the dataset
# -- confirm against the code that performs the split.
SEED = 42
TEST_SIZE = 0.20
# Severity tiers for CUAD's 41 clause labels, as seen by an EPC contractor
# (EPCC view):
#   * "critical" -- a missed clause (false negative) means direct commercial
#     exposure: uncapped liability, missing LD/insurance terms, termination
#     economics.
#   * "moderate" -- clauses that affect rights and downstream operations.
#   * "low"      -- bookkeeping fields that are mainly useful for indexing.
SEVERITY_TIERS: dict[str, list[str]] = {
    "critical": [
        "Uncapped Liability",
        "Cap On Liability",
        "Liquidated Damages",
        "Termination For Convenience",
        "Insurance",
        "Warranty Duration",
        "Covenant Not To Sue",
        "Source Code Escrow",
    ],
    "moderate": [
        "Change Of Control",
        "Anti-Assignment",
        "Audit Rights",
        "Governing Law",
        "Non-Compete",
        "Exclusivity",
        "No-Solicit Of Customers",
        "No-Solicit Of Employees",
        "Non-Disparagement",
        "Rofr/Rofo/Rofn",
        "Revenue/Profit Sharing",
        "Price Restrictions",
        "Minimum Commitment",
        "Volume Restriction",
        "Ip Ownership Assignment",
        "Joint Ip Ownership",
        "License Grant",
        "Non-Transferable License",
        "Affiliate License-Licensor",
        "Affiliate License-Licensee",
        "Unlimited/All-You-Can-Eat-License",
        "Irrevocable Or Perpetual License",
        "Post-Termination Services",
        "Third Party Beneficiary",
        "Competitive Restriction Exception",
        "Notice Period To Terminate Renewal",
        "Renewal Term",
    ],
    "low": [
        "Document Name",
        "Parties",
        "Agreement Date",
        "Effective Date",
        "Expiration Date",
        "Most Favored Nation",
    ],
}
def label_to_severity(
    tiers: dict[str, list[str]] | None = None,
) -> dict[str, str]:
    """Return a flat ``label -> severity tier`` mapping.

    Args:
        tiers: Mapping of tier name to the labels in that tier. When
            omitted (or ``None``), the module-level ``SEVERITY_TIERS`` is
            used, which is the original behaviour.

    Returns:
        A new dict with one entry per label, in tier-then-label iteration
        order. If a label is listed under more than one tier, the tier that
        comes last in iteration order wins.
    """
    if tiers is None:
        tiers = SEVERITY_TIERS
    return {label: tier for tier, labels in tiers.items() for label in labels}
# Confidence threshold below which the classifier abstains (routes to human
# review). Chosen empirically from the calibration sweep and overridable.
# NOTE(review): presumably a probability-like score in [0, 1] -- confirm
# against the code that compares predictions to this value.
DEFAULT_ABSTAIN_THRESHOLD: float = 0.45