Spaces:

satomito
/

contract-clause-analyzer

Paused

File size: 6,096 Bytes

908ff10

"""
fetch_cuad.py

Downloads & saves the 3 CUAD data artifacts:
  1. ../data/cuad/taxonomy.json -> 41 clause categories + descriptions
  2. ../data/cuad/contracts/<title>.txt -> 510 full contract texts
  3. ../data/cuad/annotations.json -> all labeled spans, organized by contract

Dataset structure (SQuAD-format):
    Each row: {id, title, context, question, answers: {text: [...], answer_start: [...]}}
    510 unique contracts x 41 clause-category questions = ~20,950 rows total (train + test).
    The same contract appears 41 times -- once per question.
    Question format: 'Highlight the parts (if any) of this contract related to
                      "{Category Name}" that should be reviewed by a lawyer.'
"""

import io
import json
import re
import urllib.request
import zipfile
from pathlib import Path

# Root directory for every CUAD artifact this script writes (relative to the
# current working directory).
DATA_DIR = Path("../data/cuad")
# Contract .txt files live *inside* DATA_DIR, matching the layout given in the
# module docstring (../data/cuad/contracts/<title>.txt). The previous value,
# DATA_DIR / "../contracts", escaped to ../data/contracts instead.
CONTRACTS_DIR = DATA_DIR / "contracts"

# Official CUAD data release (SQuAD-format JSON) from The Atticus Project
_CUAD_URL = "https://github.com/TheAtticusProject/cuad/raw/main/data.zip"

def load_cuad() -> list[dict]:
    """Fetch the CUAD archive and flatten it into SQuAD-style rows.

    The zip at ``_CUAD_URL`` is read fully into memory, every member whose
    name ends in ``.json`` is parsed, and each question/answer entry becomes
    one row that carries its article title and paragraph context.

    Returns:
        A list of dicts shaped like
        ``{"id", "title", "context", "question",
        "answers": {"text": [...], "answer_start": [...]}}``,
        in the order the entries appear in the archive.

    NOTE(review): rows from all JSON members are concatenated as-is; if the
    archive ships overlapping files, the same question would appear more than
    once -- confirm against the archive contents.
    """

    def to_row(qa: dict, contract_title: str, contract_text: str) -> dict:
        # Split the list of answer dicts into two parallel lists.
        return {
            "id": qa["id"],
            "title": contract_title,
            "context": contract_text,
            "question": qa["question"],
            "answers": {
                "text": [answer["text"] for answer in qa["answers"]],
                "answer_start": [
                    answer["answer_start"] for answer in qa["answers"]
                ],
            },
        }

    print("LOADING CUAD DATA FROM GITHUB")
    with urllib.request.urlopen(_CUAD_URL) as response:
        payload = response.read()
    print(f"  downloaded {len(payload) // 1024} KB")

    flattened: list[dict] = []
    with zipfile.ZipFile(io.BytesIO(payload)) as archive:
        members = [
            member for member in archive.namelist() if member.endswith(".json")
        ]
        for member in members:
            document = json.loads(archive.read(member))
            for article in document["data"]:
                contract_title = article["title"]
                for paragraph in article["paragraphs"]:
                    contract_text = paragraph["context"]
                    flattened.extend(
                        to_row(qa, contract_title, contract_text)
                        for qa in paragraph["qas"]
                    )

    print(f"  {len(flattened)} rows from {len(members)} file(s)")
    return flattened

def extract_taxonomy(rows: list[dict]) -> None:
    """Write the distinct clause categories found in *rows* to taxonomy.json.

    The category name is the quoted phrase inside each question:
        Highlight the parts (if any) of this contract related to
        "{Category Name}" that should be reviewed by a lawyer.
    The name may be wrapped in either double or single quotes; questions that
    do not follow this pattern are skipped.

    The full (stripped) question text of the first row seen for a category is
    stored next to its name, so it can be reused verbatim as the labeling
    prompt for that category. Ids are assigned 1..N in first-seen order.

    Args:
        rows: SQuAD-style rows; only the "question" key is read.

    Output (DATA_DIR / "taxonomy.json"):
        [{"id": 1, "name": "Document Name", "question": "<full question text>"}, ...]
    """
    # \1 forces the closing quote to match the opening one, so a name such as
    # "Licensor's Rights" is not cut short at the apostrophe.
    category_re = re.compile(r"""related to (["'])(.+?)\1 that should be""")

    # name -> question text; dicts keep insertion order, which fixes the ids.
    question_by_name: dict[str, str] = {}
    for row in rows:
        question = row["question"]
        found = category_re.search(question)
        if found:
            question_by_name.setdefault(found.group(2), question.strip())

    taxonomy = [
        {"id": index, "name": name, "question": question}
        for index, (name, question) in enumerate(question_by_name.items(), start=1)
    ]

    out_path = DATA_DIR / "taxonomy.json"
    # Create the target directory here so the function also works when it is
    # called on its own, as extract_contracts does for its directory.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(taxonomy, f, indent=2)

    print(f"  taxonomy.json: {len(taxonomy)} categories")


def extract_contracts(rows: list[dict]) -> None:
    """Save one .txt file per distinct contract title found in *rows*.

    Only the first row for a given title is used; its ``context`` value is
    written as the file body and later rows with the same title are ignored.
    Characters in the title that are unsafe in file names (``< > : " / \\ | ? *``)
    are each replaced with an underscore.

    Args:
        rows: SQuAD-style rows; the "title" and "context" keys are read.

    Output: CONTRACTS_DIR / "<sanitized_title>.txt" (UTF-8), one per title.
    """
    CONTRACTS_DIR.mkdir(parents=True, exist_ok=True)

    unsafe_chars = re.compile(r'[<>:"/\\|?*]')
    handled_titles: set[str] = set()

    for row in rows:
        contract_title = row["title"]
        if contract_title not in handled_titles:
            handled_titles.add(contract_title)
            file_name = unsafe_chars.sub("_", contract_title) + ".txt"
            target = CONTRACTS_DIR / file_name
            with target.open("w", encoding="utf-8") as handle:
                handle.write(row["context"])

    print(f"  contracts/: {len(handled_titles)} files")

def extract_annotations(rows: list[dict]) -> None:
    """Group the labeled spans in *rows* by contract title and save them as JSON.

    Every input row yields one entry under its contract title, in input order
    (so a contract has 41 entries when each category question appears once).
    An entry's ``spans`` list is empty when the row carries no answers.

    The category is the quoted phrase in the question
    (``... related to "{Category Name}" that should be ...``); the name may be
    wrapped in double or single quotes. Rows whose question does not follow
    that pattern are kept with the category "Unknown".

    Args:
        rows: SQuAD-style rows; the "title", "question" and "answers" keys
            are read.

    Output (DATA_DIR / "annotations.json"):
    {
      "<contract_title>": [
        {
          "category": "Governing Law",
          "question": "<full question text>",
          "spans": ["...annotated text...", ...],
          "span_starts": [482, ...]
        },
        ...
      ]
    }
    """
    # \1 forces the closing quote to match the opening one, so a name that
    # contains the other quote character is captured whole.
    category_re = re.compile(r"""related to (["'])(.+?)\1 that should be""")

    annotations: dict[str, list] = {}
    for row in rows:
        question = row["question"]
        found = category_re.search(question)
        annotations.setdefault(row["title"], []).append({
            "category": found.group(2) if found else "Unknown",
            "question": question.strip(),
            "spans": row["answers"]["text"],
            "span_starts": row["answers"]["answer_start"],
        })

    out_path = DATA_DIR / "annotations.json"
    # Create the target directory here so the function also works when it is
    # called on its own, as extract_contracts does for its directory.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(annotations, f, indent=2)

    total_spans = sum(
        len(entry["spans"])
        for entries in annotations.values()
        for entry in entries
    )
    print(f"  annotations.json: {len(annotations)} contracts, {total_spans} labeled spans")




def main():
    """Download CUAD once, then write the taxonomy, contract texts and annotations."""
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    rows = load_cuad()

    # Every writer consumes the same in-memory rows: taxonomy first, then the
    # contract texts, then the annotations.
    for write_artifact in (extract_taxonomy, extract_contracts, extract_annotations):
        write_artifact(rows)

    print("\nAll CUAD data saved to ../data/cuad/")


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()