Spaces:
Running
Running
| # src/summarization/dataset.py | |
| import re | |
| from pathlib import Path | |
| from datasets import Dataset | |
| from transformers import AutoTokenizer | |
| from tqdm import tqdm | |
# Regular expressions that flag a sentence as "important" (label 1).
# is_important() lowercases the sentence before searching, so every
# pattern here is written in lowercase.
IMPORTANT_PATTERNS = [
    # Holdings and conclusions.
    r"\bheld\b",
    r"\bwe conclude\b",
    r"\btherefore\b",
    # Outcome of an appeal.
    r"\bappeal is (allowed|dismissed)\b",
    # Other phrases treated as markers.
    r"\bsubstantial question\b",
    r"\baccordingly\b",
]
def sentence_split(text: str) -> list:
    """Split *text* into sentence-like fragments.

    A boundary is any run of whitespace that immediately follows ``.``,
    ``!`` or ``?``. The punctuation stays attached to the preceding
    fragment and the whitespace run itself is discarded. Fragments are not
    otherwise stripped.

    NOTE: the rule is purely typographic, so an abbreviation followed by
    whitespace (e.g. ``"v. "``) is also treated as a boundary.

    Args:
        text: The text to split.

    Returns:
        The fragments in their original order, with blank fragments
        removed. Empty or whitespace-only input yields ``[]``.
    """
    fragments = re.split(r'(?<=[.!?])\s+', text)
    # re.split yields [""] for empty input and a trailing "" when the text
    # ends with whitespace after punctuation; drop those blank fragments.
    return [fragment for fragment in fragments if fragment.strip()]
def is_important(sentence):
    """Return True if *sentence* matches any regex in IMPORTANT_PATTERNS.

    The sentence is lowercased before matching, so the check is
    case-insensitive with respect to the (lowercase) patterns.
    """
    lowered = sentence.lower()
    for pattern in IMPORTANT_PATTERNS:
        if re.search(pattern, lowered) is not None:
            # One hit is enough; skip the remaining patterns.
            return True
    return False
def build_dataset(text_dir, tokenizer_name, limit=None, *,
                  min_chars=40, max_length=256):
    """Build a tokenized sentence-level dataset from judgment text files.

    Each ``*.txt`` file directly inside *text_dir* is read and split with
    ``sentence_split``. Every stripped sentence of at least *min_chars*
    characters becomes one sample with the keys ``text``, ``label``
    (``int(is_important(sentence))``) and ``judgment_id`` (the file name
    without its extension).

    Args:
        text_dir: Directory containing the ``*.txt`` files.
        tokenizer_name: Identifier passed to
            ``AutoTokenizer.from_pretrained``.
        limit: Maximum number of files to process. ``None`` or any other
            falsy value (such as ``0``) processes every file.
        min_chars: Sentences shorter than this (after stripping) are
            skipped.
        max_length: ``max_length`` handed to the tokenizer, which is asked
            to truncate and to pad to that length.

    Returns:
        The result of ``Dataset.map`` applying the tokenizer, in batches,
        to the ``text`` column of the collected samples.
    """
    # Load the tokenizer first so a bad name fails before any file is read.
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    # glob() yields entries in filesystem order; sort so that the subset
    # chosen by `limit` (and the sample order) is reproducible across runs.
    files = sorted(Path(text_dir).glob("*.txt"))
    if limit:
        files = files[:limit]

    samples = []
    for file in tqdm(files, desc="Processing judgments"):
        judgment_id = file.stem
        # errors="ignore" drops undecodable bytes instead of raising.
        text = file.read_text(encoding="utf-8", errors="ignore")
        for sent in sentence_split(text):
            sent = sent.strip()
            if len(sent) < min_chars:
                continue
            samples.append({
                "text": sent,
                "label": int(is_important(sent)),
                "judgment_id": judgment_id,
            })

    dataset = Dataset.from_list(samples)

    def tokenize(batch):
        """Tokenize one batch of sentences from the ``text`` column."""
        return tokenizer(
            batch["text"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )

    return dataset.map(tokenize, batched=True)
if __name__ == "__main__":
    # Script entry point: build the tokenized dataset from the extracted
    # judgment texts and save it to disk.
    output_dir = "data/processed/summarization_dataset"

    print("Started dataset building...")
    # build_dataset() tokenizes before it returns, so the notice about the
    # slow step has to be printed before the call, not after it.
    print("Tokenizing dataset... this may take a moment.")
    # Using a limit of 1000 for training, can be increased later
    # 1000 judgments will yield ~50k-100k sentences, good for fine-tuning
    ds = build_dataset(
        "data/processed/extracted/texts",
        "nlpaueb/legal-bert-base-uncased",
        limit=1000,
    )
    print(f"Total sentences extracted: {len(ds)}")
    print("Saving to Disk...")
    ds.save_to_disk(output_dir)
    print(f"✓ Dataset ready at {output_dir}")