satomitheito's picture
hf streamlit
908ff10
"""
fetch_cuad.py
Downloads & saves the 3 CUAD data artifacts:
1. ../data/cuad/taxonomy.json -> 41 clause categories + descriptions
2. ../data/cuad/contracts/<title>.txt -> 510 full contract texts
3. ../data/cuad/annotations.json -> all labeled spans, organized by contract
Dataset structure (SQuAD-format):
Each row: {id, title, context, question, answers: {text: [...], answer_start: [...]}}
510 unique contracts x 41 clause-category questions = ~20,950 rows total (train + test).
The same contract appears 41 times -- once per question.
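Question format (the category name is wrapped in double quotes, which is what
the extraction regex in this file matches):
    Highlight the parts (if any) of this contract related to
    "{Category Name}" that should be reviewed by a lawyer.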
"""
import io
import json
import re
import urllib.request
import zipfile
from pathlib import Path
# Output root for taxonomy.json and annotations.json. This is a relative path,
# so it is resolved against the current working directory at run time.
DATA_DIR = Path("../data/cuad")
# Directory that receives one .txt file per contract.
# NOTE(review): because of the "../" segment this resolves to
# ../data/contracts (a sibling of DATA_DIR), whereas the module docstring and
# the final status message in main() describe ../data/cuad/contracts/.
# Confirm which location is intended before changing it.
CONTRACTS_DIR = DATA_DIR / "../contracts"
# Official CUAD data release (SQuAD-format JSON) from The Atticus Project
_CUAD_URL = "https://github.com/TheAtticusProject/cuad/raw/main/data.zip"
def load_cuad(timeout: float = 60.0) -> list[dict]:
    """Download the CUAD data.zip archive and return a flat list of SQuAD-format rows.

    Every member of the archive whose name ends in ".json" is parsed as a
    SQuAD-style document (data -> paragraphs -> qas); each question/answer
    entry becomes one row that repeats its article title and paragraph context.

    Args:
        timeout: Seconds to wait on any single blocking network operation
            before giving up. Without a timeout a stalled connection would
            block the script forever.

    Returns:
        A list of dicts with the keys "id", "title", "context", "question"
        and "answers". "answers" holds two parallel lists, "text" and
        "answer_start".

    Raises:
        urllib.error.URLError: If the archive cannot be fetched.
        TimeoutError: If a read stalls for longer than ``timeout`` seconds.
        zipfile.BadZipFile: If the downloaded payload is not a zip archive.
        KeyError: If a JSON member does not have the expected SQuAD keys.
    """
    print("LOADING CUAD DATA FROM GITHUB")
    with urllib.request.urlopen(_CUAD_URL, timeout=timeout) as resp:
        data = resp.read()
    print(f" downloaded {len(data) // 1024} KB")

    rows: list[dict] = []
    # The archive is opened straight from memory; nothing is written to disk here.
    with zipfile.ZipFile(io.BytesIO(data)) as zf:
        json_files = [n for n in zf.namelist() if n.endswith(".json")]
        for name in json_files:
            squad = json.loads(zf.read(name))
            for article in squad["data"]:
                title = article["title"]
                for para in article["paragraphs"]:
                    context = para["context"]
                    for qa in para["qas"]:
                        answers = qa["answers"]
                        rows.append({
                            "id": qa["id"],
                            "title": title,
                            "context": context,
                            "question": qa["question"],
                            "answers": {
                                "text": [a["text"] for a in answers],
                                "answer_start": [a["answer_start"] for a in answers],
                            },
                        })
    print(f" {len(rows)} rows from {len(json_files)} file(s)")
    return rows
def extract_taxonomy(rows: list[dict]) -> None:
    """
    Build the clause-category taxonomy from the question texts and write it
    to taxonomy.json inside DATA_DIR.

    A category name is whatever sits between the double quotes in
        ... related to "<Category Name>" that should be ...
    Rows whose question does not contain that phrase are skipped. For each
    name, the first question seen (with surrounding whitespace stripped) is
    kept as its description. Ids are 1-based and follow first-appearance order.

    Output:
        [{"id": 1, "name": "<category name>", "question": "<full question text>"}, ...]
    """
    category_pattern = re.compile(r'related to "(.+?)" that should be')

    # dicts keep insertion order, so this also records first-appearance order.
    first_question: dict[str, str] = {}
    for row in rows:
        text = row["question"]
        found = category_pattern.search(text)
        if found is None:
            continue
        first_question.setdefault(found.group(1), text.strip())

    taxonomy = []
    for number, (label, text) in enumerate(first_question.items(), start=1):
        taxonomy.append({"id": number, "name": label, "question": text})

    target = DATA_DIR / "taxonomy.json"
    with target.open("w") as handle:
        json.dump(taxonomy, handle, indent=2)
    print(f" taxonomy.json: {len(taxonomy)} categories")
def extract_contracts(rows: list[dict]) -> None:
    """
    Write one UTF-8 .txt file per distinct contract title into CONTRACTS_DIR.

    Only the first row seen for a given title is used; its `context` value
    becomes the file content and later rows with the same title are ignored.
    The file name is the title with every character matched by
    `unsafe_chars` replaced by an underscore, plus the ".txt" suffix.
    """
    CONTRACTS_DIR.mkdir(parents=True, exist_ok=True)
    unsafe_chars = re.compile(r'[<>:"/\\|?*]')

    written: set[str] = set()
    for row in rows:
        title = row["title"]
        if title not in written:
            written.add(title)
            target = CONTRACTS_DIR / (unsafe_chars.sub("_", title) + ".txt")
            with target.open("w", encoding="utf-8") as handle:
                handle.write(row["context"])
    print(f" contracts/: {len(written)} files")
def extract_annotations(rows: list[dict]) -> None:
    """
    Group the labeled spans by contract title and write them to
    annotations.json inside DATA_DIR.

    Each input row contributes one entry to the list stored under its title.
    The category is the text between the double quotes in
        ... related to "<Category Name>" that should be ...
    and falls back to the literal string Unknown when the question does not
    contain that phrase. `spans` and `span_starts` are copied unchanged from
    the row's answers, so both are empty lists when nothing was annotated.

    Output:
        {
            "<contract_title>": [
                {
                    "category": "<category name>",
                    "question": "<full question text>",
                    "spans": ["...annotated text...", ...],
                    "span_starts": [482, ...]
                },
                ...one entry per row of that contract...
            ]
        }
    """
    category_pattern = re.compile(r'related to "(.+?)" that should be')

    annotations: dict[str, list] = {}
    for row in rows:
        entries = annotations.setdefault(row["title"], [])
        text = row["question"]
        found = category_pattern.search(text)
        answers = row["answers"]
        entries.append({
            "category": found.group(1) if found else "Unknown",
            "question": text.strip(),
            "spans": answers["text"],
            "span_starts": answers["answer_start"],
        })

    target = DATA_DIR / "annotations.json"
    with target.open("w") as handle:
        json.dump(annotations, handle, indent=2)

    # Count the spans only after the file is written, for the summary line.
    total_spans = 0
    for entries in annotations.values():
        for entry in entries:
            total_spans += len(entry["spans"])
    print(f" annotations.json: {len(annotations)} contracts, {total_spans} labeled spans")
def main():
    """Create DATA_DIR, download the rows once, then write all three artifacts."""
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    rows = load_cuad()
    # Each extractor reads the same in-memory rows and writes its own output.
    for step in (extract_taxonomy, extract_contracts, extract_annotations):
        step(rows)
    print("\nAll CUAD data saved to ../data/cuad/")


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()