File size: 2,139 Bytes
968e24d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# src/summarization/dataset.py
import re
from pathlib import Path
from datasets import Dataset
from transformers import AutoTokenizer
from tqdm import tqdm

# Regex cue phrases used as a labelling heuristic: is_important() lowercases a
# sentence and flags it when at least one of these patterns matches. The
# patterns are therefore written in lowercase, and the \b anchors restrict
# each match to whole words/phrases.
IMPORTANT_PATTERNS = [
    r"\bheld\b",
    r"\bwe conclude\b",
    r"\btherefore\b",
    r"\bappeal is (allowed|dismissed)\b",  # either outcome counts
    r"\bsubstantial question\b",
    r"\baccordingly\b",
]

def sentence_split(text):
    """Break *text* into sentence-like chunks.

    A split happens at every run of whitespace that directly follows a
    ``.``, ``!`` or ``?``. The punctuation stays attached to the preceding
    chunk and the whitespace run itself is discarded. No abbreviation
    handling is performed, so ``"A v. B"`` is split after ``"v."``.
    """
    boundary = re.compile(r'(?<=[.!?])\s+')
    return boundary.split(text)

def is_important(sentence):
    """Return True when *sentence* contains any IMPORTANT_PATTERNS cue phrase.

    The sentence is lowercased before matching, so the lowercase patterns
    match regardless of the original capitalisation.
    """
    lowered = sentence.lower()
    for pattern in IMPORTANT_PATTERNS:
        if re.search(pattern, lowered):
            return True
    return False

def build_dataset(text_dir, tokenizer_name, limit=None, *, min_chars=40, max_length=256):
    """Build a tokenized sentence-classification dataset from judgment text files.

    Every ``*.txt`` file directly inside *text_dir* is read, split into
    sentences with ``sentence_split``, and each sufficiently long sentence
    becomes one sample labelled by ``is_important``.

    Args:
        text_dir: Directory holding the ``*.txt`` files (not searched
            recursively).
        tokenizer_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
        limit: Maximum number of files to process, taken in sorted path
            order. Falsy values (``None``, ``0``) mean "all files".
        min_chars: Sentences shorter than this many characters (after
            stripping surrounding whitespace) are skipped.
        max_length: Token length every sample is truncated and padded to.

    Returns:
        The ``Dataset`` produced by mapping the tokenizer over samples with
        the fields ``text``, ``label`` (0 or 1) and ``judgment_id`` (the
        source file's stem).
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    # glob() gives no ordering guarantee, so sort first: otherwise `limit`
    # would select a different subset of files from one machine/run to the next.
    files = sorted(Path(text_dir).glob("*.txt"))
    if limit:
        files = files[:limit]

    samples = []
    for file in tqdm(files, desc="Processing judgments"):
        judgment_id = file.stem
        # Undecodable bytes are dropped instead of aborting the whole run.
        text = file.read_text(encoding="utf-8", errors="ignore")

        for sent in sentence_split(text):
            sent = sent.strip()
            if len(sent) < min_chars:
                continue

            samples.append({
                "text": sent,
                "label": int(is_important(sent)),
                "judgment_id": judgment_id,
            })

    dataset = Dataset.from_list(samples)

    def tokenize(batch):
        # Fixed-length padding: every row ends up exactly `max_length` tokens.
        return tokenizer(
            batch["text"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )

    return dataset.map(tokenize, batched=True)

if __name__ == "__main__":
    print("Started dataset building...")
    # Announce the slow step before it runs: build_dataset() both extracts the
    # sentences and tokenizes them, so once it returns tokenization is done.
    print("Tokenizing dataset... this may take a moment.")
    # Using a limit of 1000 for training, can be increased later
    # 1000 judgments will yield ~50k-100k sentences, good for fine-tuning
    ds = build_dataset(
        "data/processed/extracted/texts",
        "nlpaueb/legal-bert-base-uncased",
        limit=1000
    )
    print(f"Total sentences extracted: {len(ds)}")

    print("Saving to Disk...")
    ds.save_to_disk("data/processed/summarization_dataset")
    print("✓ Dataset ready at data/processed/summarization_dataset")