File size: 6,096 Bytes
908ff10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
"""
fetch_cuad.py

Downloads & saves the 3 CUAD data artifacts:
  1. ../data/cuad/taxonomy.json -> 41 clause categories + descriptions
  2. ../data/cuad/contracts/<title>.txt -> 510 full contract texts
  3. ../data/cuad/annotations.json -> all labeled spans, organized by contract

Dataset structure (SQuAD-format):
    Each row: {id, title, context, question, answers: {text: [...], answer_start: [...]}}
    510 unique contracts x 41 clause-category questions = 20,910 rows total (train + test).
    The same contract appears 41 times -- once per question.
    Question format: Highlight the parts (if any) of this contract related to
                     "{Category Name}" that should be reviewed by a lawyer.
"""

import io
import json
import re
import urllib.request
import zipfile
from pathlib import Path

# Root directory for every CUAD artifact this script writes. The path is
# relative, so it is resolved against the current working directory.
DATA_DIR = Path("../data/cuad")
# Contract texts live inside DATA_DIR: ../data/cuad/contracts/<title>.txt.
# (A "../contracts" component here would escape DATA_DIR and land the files
# in ../data/contracts instead.)
CONTRACTS_DIR = DATA_DIR / "contracts"

# Official CUAD data release (SQuAD-format JSON) from The Atticus Project
_CUAD_URL = "https://github.com/TheAtticusProject/cuad/raw/main/data.zip"

def load_cuad() -> list[dict]:
    """Download the CUAD ``data.zip`` archive and flatten it into SQuAD-style rows.

    The archive is fetched from ``_CUAD_URL`` and kept in memory; this function
    writes nothing to disk. Every ``*.json`` member is parsed as a SQuAD-format
    document (``data -> paragraphs -> qas``) and each question/answer entry
    becomes one row.

    Returns:
        A list of dicts shaped like
        ``{"id", "title", "context", "question",
        "answers": {"text": [...], "answer_start": [...]}}``.
        A given ``id`` appears at most once, even when several JSON members
        of the archive contain it.

    Raises:
        OSError: if the download fails or stalls past the timeout
            (``urllib.error.URLError`` and ``TimeoutError`` are subclasses).
        zipfile.BadZipFile: if the downloaded payload is not a zip archive.
    """
    # Upper bound, in seconds, on any single blocking socket operation.
    # Without it a stalled connection would hang the script indefinitely.
    timeout_s = 60

    print("LOADING CUAD DATA FROM GITHUB")
    with urllib.request.urlopen(_CUAD_URL, timeout=timeout_s) as resp:
        data = resp.read()
    print(f"  downloaded {len(data) // 1024} KB")

    rows: list[dict] = []
    # NOTE(review): the release archive is believed to ship a combined file
    # alongside train/test splits -- confirm. Keeping only the first row per
    # id guarantees overlapping members cannot double-count a
    # (contract, question) pair downstream.
    seen_ids: set[str] = set()

    with zipfile.ZipFile(io.BytesIO(data)) as zf:
        json_files = [n for n in zf.namelist() if n.endswith(".json")]
        for name in json_files:
            squad = json.loads(zf.read(name))
            for article in squad["data"]:
                title = article["title"]
                for para in article["paragraphs"]:
                    context = para["context"]
                    for qa in para["qas"]:
                        qa_id = qa["id"]
                        if qa_id in seen_ids:
                            continue
                        seen_ids.add(qa_id)

                        answers = qa["answers"]
                        rows.append({
                            "id": qa_id,
                            "title": title,
                            "context": context,
                            "question": qa["question"],
                            "answers": {
                                "text": [a["text"] for a in answers],
                                "answer_start": [a["answer_start"] for a in answers],
                            },
                        })

    print(f"  {len(rows)} rows from {len(json_files)} file(s)")
    return rows

def extract_taxonomy(rows: list[dict]) -> None:
    """
    Extract the distinct clause categories and write them to taxonomy.json

    Pattern for questions (the category name is wrapped in double quotes):
        Highlight the parts (if any) of this contract related to
        "{Category Name}" that should be reviewed by a lawyer.

    The full question text is preserved as the category description since it is
    exactly the prompt the Classification Agent should use when labeling clauses

    Categories are numbered from 1 in the order they are first seen in `rows`.
    Rows whose question does not match the pattern are skipped.

    Args:
        rows: SQuAD-style rows; only the "question" key of each row is read.

    Output (written to DATA_DIR / "taxonomy.json", UTF-8):
        [{"id": 1, "name": "Document Name", "question": "<full question text>"}, ...]
    """
    # Compiled once: the same pattern is applied to every row.
    category_re = re.compile(r'related to "(.+?)" that should be')

    seen: dict[str, str] = {}  # name -> question text (deduped, insertion-ordered)
    for row in rows:
        question = row["question"]
        match = category_re.search(question)
        if match:
            # setdefault keeps the first question seen for each category.
            seen.setdefault(match.group(1), question.strip())

    taxonomy = [
        {"id": i, "name": name, "question": question}
        for i, (name, question) in enumerate(seen.items(), start=1)
    ]

    # Create the output directory here, as extract_contracts does for its own
    # directory, so this function also works when called on its own.
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    out_path = DATA_DIR / "taxonomy.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(taxonomy, f, indent=2)

    print(f"  taxonomy.json: {len(taxonomy)} categories")


def extract_contracts(rows: list[dict]) -> None:
    """
    Save the full text of every distinct contract as its own .txt file.

    Rows are grouped by `title`; only the first row seen for a title is used,
    and its `context` field is written out as the contract text. Characters
    that are unsafe in file names (< > : " / \\ | ? *) are replaced with "_".

    Output: CONTRACTS_DIR/<sanitized_title>.txt (UTF-8), one per unique title
    """
    CONTRACTS_DIR.mkdir(parents=True, exist_ok=True)

    written_titles: set[str] = set()
    for record in rows:
        contract_title = record["title"]
        if contract_title not in written_titles:
            written_titles.add(contract_title)

            file_name = re.sub(r'[<>:"/\\|?*]', "_", contract_title) + ".txt"
            target = CONTRACTS_DIR / file_name
            target.write_text(record["context"], encoding="utf-8")

    print(f"  contracts/: {len(written_titles)} files")

def extract_annotations(rows: list[dict]) -> None:
    """
    Extract all the labeled spans and organize by contract title

    Each row contributes one entry to its contract's list, so a contract ends
    up with one entry per row that carries its title (one per category when
    `rows` holds each contract/question pair once).
    An entry's `spans` list is empty if that clause type doesn't appear in the contract
    If a question does not match the expected pattern, its category is "Unknown"

    Args:
        rows: SQuAD-style rows; reads "title", "question" and
            "answers" ("text" and "answer_start") from each row.

    Output (written to DATA_DIR / "annotations.json", UTF-8):
    {
      "<contract_title>": [
        {
          "category": "Governing Law",
          "question": "<full question text>",
          "spans": ["...annotated text...", ...],
          "span_starts": [482, ...]
        },
        ...one entry per row of that contract...
      ]
    }
    """
    # Compiled once: the same pattern is applied to every row.
    category_re = re.compile(r'related to "(.+?)" that should be')

    annotations: dict[str, list] = {}
    for row in rows:
        question = row["question"]
        match = category_re.search(question)
        answers = row["answers"]

        annotations.setdefault(row["title"], []).append({
            "category": match.group(1) if match else "Unknown",
            "question": question.strip(),
            "spans": answers["text"],
            "span_starts": answers["answer_start"],
        })

    # Create the output directory here, as extract_contracts does for its own
    # directory, so this function also works when called on its own.
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    out_path = DATA_DIR / "annotations.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(annotations, f, indent=2)

    total_spans = sum(
        len(entry["spans"])
        for entries in annotations.values()
        for entry in entries
    )
    print(f"  annotations.json: {len(annotations)} contracts, {total_spans} labeled spans")




def main():
    """Run the whole pipeline: create DATA_DIR, download the rows, write all artifacts."""
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    rows = load_cuad()

    # Each extractor consumes the same row list and writes its own artifact;
    # the order matches the numbered list in the module docstring order of
    # taxonomy, contracts, annotations.
    for write_artifact in (extract_taxonomy, extract_contracts, extract_annotations):
        write_artifact(rows)

    print("\nAll CUAD data saved to ../data/cuad/")


if __name__ == "__main__":
    main()