Soundaryasos committed on
Commit 32cccf7 · verified · 1 Parent(s): 5053c88

Create app.py

Files changed (1)
  1. app.py +793 -0
app.py ADDED
@@ -0,0 +1,793 @@
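# NOTE: the imports below assume the usual PyPI distributions for these modules
# (transformers, torch, sentence-transformers, spacy, nltk, pymupdf for fitz,
# python-docx for docx, beautifulsoup4 for bs4, scikit-learn, networkx, numpy,
# pandas) plus the spaCy model: python -m spacy download en_core_web_lg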
import re
import spacy
import pandas as pd
from typing import List, Dict, Tuple, Optional
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
import fitz  # PyMuPDF
import docx
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import sent_tokenize
import numpy as np
import torch
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Download necessary NLTK data ('punkt_tab' is also required by newer NLTK releases)
nltk.download('punkt')
nltk.download('punkt_tab')

# Load legal-specific NLP model
nlp = spacy.load("en_core_web_lg")

class LegalDocumentProcessor:
    """
    A comprehensive pipeline for processing legal documents.
    Handles document loading, text extraction, preprocessing, and tokenization.
    """

    def __init__(self, tokenizer_name: str = "nlpaueb/legal-bert-base-uncased"):
        """
        Initialize the legal document processor.
        Args:
            tokenizer_name: The HuggingFace tokenizer to use for transformer models
        """
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

        # Legal-specific patterns
        self.legal_abbreviations = {
            "et al.": "and others",
            "i.e.": "that is",
            "e.g.": "for example",
            "v.": "versus",
            "cf.": "compare",
            "viz.": "namely",
            "ex rel.": "on behalf of",
            "etc.": "etcetera"
        }

        # Regular expressions for legal citations and references
        self.citation_pattern = re.compile(r'\d+\s+[A-Za-z\.]+\s+\d+')
        self.section_pattern = re.compile(r'Section\s+\d+\.\d+', re.IGNORECASE)

        # Legal boilerplate text patterns
        self.boilerplate_patterns = [
            r"IN WITNESS WHEREOF.*",
            r"WHEREAS,.*",
            r"NOW, THEREFORE,.*",
            r"The parties hereby agree as follows:.*"
        ]
        self.boilerplate_regex = re.compile('|'.join(self.boilerplate_patterns), re.DOTALL)

    def extract_text_from_file(self, file_path: str) -> str:
        """
        Extract text from various file formats (PDF, DOCX, TXT, HTML).
        Args:
            file_path: Path to the legal document file
        Returns:
            Extracted text as a string
        """
        file_extension = file_path.split('.')[-1].lower()
        if file_extension == 'pdf':
            return self._extract_from_pdf(file_path)
        elif file_extension in ['docx', 'doc']:
            return self._extract_from_docx(file_path)
        elif file_extension == 'txt':
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        elif file_extension in ['html', 'htm']:
            return self._extract_from_html(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_extension}")

    def _extract_from_pdf(self, file_path: str) -> str:
        """Extract text from PDF files"""
        doc = fitz.open(file_path)
        text = ""
        for page in doc:
            text += page.get_text()
        return text

    def _extract_from_docx(self, file_path: str) -> str:
        """Extract text from DOCX files"""
        doc = docx.Document(file_path)
        return '\n'.join([para.text for para in doc.paragraphs])

    def _extract_from_html(self, file_path: str) -> str:
        """Extract text from HTML files"""
        with open(file_path, 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')
        return soup.get_text()

    def preprocess_text(self, text: str) -> str:
        """
        Preprocess legal text by:
        - Expanding abbreviations
        - Removing redundant whitespace
        - Handling special characters
        - Maintaining sentence structure
        Args:
            text: Raw text extracted from a legal document
        Returns:
            Preprocessed text
        """
        # Replace legal abbreviations (use a lookahead instead of a trailing \b,
        # which never matches after a closing period)
        for abbr, expansion in self.legal_abbreviations.items():
            text = re.sub(r'\b' + re.escape(abbr) + r'(?!\w)', expansion, text)

        # Normalize newlines first so section boundaries survive,
        # then collapse redundant spaces and tabs
        text = re.sub(r'\n+', '\n', text)
        text = re.sub(r'[ \t]+', ' ', text)

        # Separate citation references to prevent them from merging with sentences
        text = re.sub(self.citation_pattern, r' \g<0> ', text)

        # Handle section references
        text = re.sub(self.section_pattern, r' \g<0> ', text)

        return text.strip()

    def identify_document_structure(self, text: str) -> Dict[str, str]:
        """
        Identify key structural elements in the legal document.
        Args:
            text: Preprocessed legal document text
        Returns:
            Dictionary mapping section headers to section text
        """
        # Split into sections based on headers
        sections = {}

        # Identify potential headers (uppercase text followed by a colon, period, or newline)
        potential_headers = re.finditer(r'([A-Z][A-Z\s]+[A-Z])[:\.\n]', text)

        # Extract sections based on identified headers
        last_pos = 0
        last_header = "PREAMBLE"
        for match in potential_headers:
            header = match.group(1).strip()
            start_pos = match.start()

            # Add the previous section
            if last_pos < start_pos:
                sections[last_header] = text[last_pos:start_pos].strip()

            last_pos = match.end()
            last_header = header

        # Add the final section
        if last_pos < len(text):
            sections[last_header] = text[last_pos:].strip()

        return sections

    def extract_sentences(self, text: str) -> List[str]:
        """
        Split text into sentences, handling legal-specific patterns.
        Args:
            text: Preprocessed legal document text
        Returns:
            List of sentences
        """
        # Use NLTK's sentence tokenizer as a base
        sentences = sent_tokenize(text)

        # Post-process to handle potential issues with legal text
        processed_sentences = []
        for sentence in sentences:
            # Skip empty sentences
            if not sentence.strip():
                continue

            # Clean up sentences
            sentence = sentence.strip()

            # Check if sentence is too long (might be incorrectly split)
            if len(sentence) > 500:
                # Try to break it further at punctuation marks
                sub_sentences = re.split(r'[;:](?=\s)', sentence)
                processed_sentences.extend([s.strip() for s in sub_sentences if s.strip()])
            else:
                processed_sentences.append(sentence)

        return processed_sentences

    def tokenize_for_transformer(self, text: str, max_length: int = 512) -> Dict:
        """
        Tokenize text for transformer-based models.
        Args:
            text: Input text to tokenize
            max_length: Maximum token length for the model
        Returns:
            Tokenized input dict ready for transformer models
        """
        return self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )

    def extract_entities(self, text: str) -> List[Dict]:
        """
        Extract legal entities from text using spaCy.
        Args:
            text: Legal document text
        Returns:
            List of extracted entities with type information
        """
        doc = nlp(text)
        entities = []
        for ent in doc.ents:
            entities.append({
                "text": ent.text,
                "start": ent.start_char,
                "end": ent.end_char,
                "type": ent.label_
            })

        # Additional legal entity extraction for common patterns
        # Extract case citations
        case_citations = re.finditer(r'[A-Za-z\s]+ v\. [A-Za-z\s]+,?\s+\d+\s+[A-Za-z\.]+\s+\d+', text)
        for match in case_citations:
            entities.append({
                "text": match.group(0),
                "start": match.start(),
                "end": match.end(),
                "type": "CASE_CITATION"
            })

        # Extract statutory references
        statutes = re.finditer(r'\d+\s+U\.S\.C\.\s+§\s+\d+', text)
        for match in statutes:
            entities.append({
                "text": match.group(0),
                "start": match.start(),
                "end": match.end(),
                "type": "STATUTE"
            })

        return entities

    def chunk_document(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
        """
        Split document into overlapping chunks for processing.
        Args:
            text: Document text
            chunk_size: Approximate size of each chunk in characters
            overlap: Number of characters to overlap between chunks
        Returns:
            List of document chunks
        """
        # First split by sentences
        sentences = self.extract_sentences(text)
        chunks = []
        current_chunk = []
        current_length = 0
        for sentence in sentences:
            sentence_length = len(sentence)

            # If adding this sentence would exceed chunk size
            if current_length + sentence_length > chunk_size and current_chunk:
                # Add the current chunk to our list of chunks
                chunks.append(' '.join(current_chunk))

                # Start a new chunk with overlap
                # Find sentences to keep for overlap
                overlap_chars = 0
                overlap_sentences = []
                for s in reversed(current_chunk):
                    if overlap_chars + len(s) <= overlap:
                        overlap_sentences.insert(0, s)
                        overlap_chars += len(s) + 1  # +1 for the space
                    else:
                        break

                current_chunk = overlap_sentences
                current_length = overlap_chars

            current_chunk.append(sentence)
            current_length += sentence_length + 1  # +1 for the space

        # Don't forget the last chunk
        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    def process_document(self, file_path: str) -> Dict:
        """
        Complete processing pipeline for a legal document.
        Args:
            file_path: Path to the legal document
        Returns:
            Dictionary containing processed document information
        """
        # Extract text from file
        raw_text = self.extract_text_from_file(file_path)

        # Preprocess the text
        preprocessed_text = self.preprocess_text(raw_text)

        # Identify document structure
        structure = self.identify_document_structure(preprocessed_text)

        # Extract sentences
        sentences = self.extract_sentences(preprocessed_text)

        # Chunk document for processing
        chunks = self.chunk_document(preprocessed_text)

        # Extract entities
        entities = self.extract_entities(preprocessed_text)

        return {
            "raw_text": raw_text,
            "preprocessed_text": preprocessed_text,
            "structure": structure,
            "sentences": sentences,
            "chunks": chunks,
            "entities": entities
        }

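# Example usage of LegalDocumentProcessor (illustrative sketch only;
# "contract.pdf" is a placeholder path, not a file shipped with this repo):
#
#   processor = LegalDocumentProcessor()
#   doc_info = processor.process_document("contract.pdf")
#   print(list(doc_info["structure"].keys()))   # detected section headers
#   print(len(doc_info["chunks"]))              # number of overlapping chunks
#   print(doc_info["entities"][:5])             # first few extracted entities
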
class LegalSummarizer:
    """
    A comprehensive summarization engine for legal documents that implements
    both extractive and abstractive summarization techniques.
    """

    def __init__(
        self,
        extractive_model: str = "sentence-transformers/all-MiniLM-L6-v2",
        abstractive_model: str = "facebook/bart-large-cnn",
        use_gpu: bool = torch.cuda.is_available()
    ):
        """
        Initialize the legal summarization engine.
        Args:
            extractive_model: Model name for sentence embeddings (extractive)
            abstractive_model: Model name for seq2seq summarization (abstractive)
            use_gpu: Whether to use GPU for inference
        """
        self.device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")

        # Load models
        print(f"Loading extractive model: {extractive_model}")
        self.sentence_model = SentenceTransformer(extractive_model)
        self.sentence_model.to(self.device)

        print(f"Loading abstractive model: {abstractive_model}")
        self.abstractive_tokenizer = AutoTokenizer.from_pretrained(abstractive_model)
        self.abstractive_model = AutoModelForSeq2SeqLM.from_pretrained(abstractive_model)
        self.abstractive_model.to(self.device)

        # Initialize TF-IDF vectorizer for keyword extraction
        self.tfidf_vectorizer = TfidfVectorizer(
            max_features=5000,
            stop_words='english',
            ngram_range=(1, 2)
        )

    def extractive_summarize(
        self,
        sentences: List[str],
        ratio: float = 0.3,
        method: str = "textrank"
    ) -> List[str]:
        """
        Generate an extractive summary of the document.
        Args:
            sentences: List of sentences from the document
            ratio: Percentage of sentences to keep (0.0-1.0)
            method: Summarization method ('textrank', 'lexrank', or 'tfidf')
        Returns:
            List of extracted sentences forming the summary
        """
        if len(sentences) == 0:
            return []

        # Ensure we have a valid ratio
        ratio = max(0.1, min(0.9, ratio))
        num_sentences = max(1, int(len(sentences) * ratio))

        if method == "textrank":
            return self._textrank_summarize(sentences, num_sentences)
        elif method == "lexrank":
            return self._lexrank_summarize(sentences, num_sentences)
        elif method == "tfidf":
            return self._tfidf_summarize(sentences, num_sentences)
        else:
            raise ValueError(f"Unknown summarization method: {method}")

    def _textrank_summarize(self, sentences: List[str], num_sentences: int) -> List[str]:
        """
        TextRank-based extractive summarization.
        Args:
            sentences: List of document sentences
            num_sentences: Number of sentences to extract
        Returns:
            List of extracted sentences
        """
        # Compute sentence embeddings
        embeddings = self.sentence_model.encode(sentences, convert_to_tensor=True)
        embeddings = embeddings.cpu().numpy()

        # Compute similarity matrix
        sim_matrix = cosine_similarity(embeddings)

        # Create graph and run PageRank
        nx_graph = nx.from_numpy_array(sim_matrix)
        scores = nx.pagerank(nx_graph)

        # Sort sentences by score
        ranked_sentences = sorted(((scores[i], s, i) for i, s in enumerate(sentences)), reverse=True)

        # Select top sentences and preserve original order
        top_sentence_indices = sorted([item[2] for item in ranked_sentences[:num_sentences]])
        return [sentences[i] for i in top_sentence_indices]

    def _lexrank_summarize(self, sentences: List[str], num_sentences: int) -> List[str]:
        """
        LexRank-based extractive summarization.
        Args:
            sentences: List of document sentences
            num_sentences: Number of sentences to extract
        Returns:
            List of extracted sentences
        """
        # Compute sentence embeddings
        embeddings = self.sentence_model.encode(sentences, convert_to_tensor=True)
        embeddings = embeddings.cpu().numpy()

        # Compute similarity matrix
        sim_matrix = cosine_similarity(embeddings)

        # Apply threshold to create a binary similarity matrix
        threshold = 0.3  # Can be tuned
        sim_matrix_binary = (sim_matrix > threshold).astype(int)

        # Normalize the matrix by row sums
        row_sums = sim_matrix_binary.sum(axis=1, keepdims=True)
        row_sums[row_sums == 0] = 1  # Avoid division by zero
        transition_matrix = sim_matrix_binary / row_sums

        # Apply power iteration to find the principal eigenvector
        scores = np.ones(len(sentences)) / len(sentences)
        epsilon = 1e-4
        max_iter = 100
        for _ in range(max_iter):
            prev_scores = scores.copy()
            scores = np.dot(transition_matrix.T, scores)
            scores = scores / np.sum(scores)
            if np.sum(np.abs(scores - prev_scores)) < epsilon:
                break

        # Rank sentences
        ranked_indices = np.argsort(-scores)

        # Select top sentences and preserve original order
        top_sentence_indices = sorted(ranked_indices[:num_sentences])
        return [sentences[i] for i in top_sentence_indices]

    def _tfidf_summarize(self, sentences: List[str], num_sentences: int) -> List[str]:
        """
        TF-IDF based extractive summarization.
        Args:
            sentences: List of document sentences
            num_sentences: Number of sentences to extract
        Returns:
            List of extracted sentences
        """
        # Handle the case where we have only one sentence
        if len(sentences) <= 1:
            return sentences

        # Compute TF-IDF matrix
        tfidf_matrix = self.tfidf_vectorizer.fit_transform(sentences)

        # Compute document centroid (as a plain ndarray; scikit-learn rejects np.matrix input)
        centroid = np.asarray(tfidf_matrix.mean(axis=0))

        # Compute similarity of each sentence to centroid
        similarities = []
        for i in range(tfidf_matrix.shape[0]):
            similarity = cosine_similarity(tfidf_matrix[i], centroid)[0][0]
            similarities.append((i, similarity))

        # Rank sentences
        ranked_sentences = sorted(similarities, key=lambda x: x[1], reverse=True)

        # Select top sentences and preserve original order
        top_sentence_indices = sorted([idx for idx, _ in ranked_sentences[:num_sentences]])
        return [sentences[i] for i in top_sentence_indices]

    def abstractive_summarize(
        self,
        text: str,
        max_length: int = 512,
        min_length: int = 150,
        num_beams: int = 4,
        legal_context: bool = True
    ) -> str:
        """
        Generate an abstractive summary of the document.
        Args:
            text: Text to summarize
            max_length: Maximum length of the summary
            min_length: Minimum length of the summary
            num_beams: Number of beams to use for beam search
            legal_context: Add legal domain context to input
        Returns:
            Abstractive summary as a string
        """
        # Add legal context if requested (prepended before tokenization so it is actually used)
        prefix = "Summarize this legal document: " if legal_context else ""

        # Truncate long text to model's maximum input length
        input_max_length = self.abstractive_tokenizer.model_max_length - 100  # Leave room for summary

        # Tokenize and truncate
        input_ids = self.abstractive_tokenizer.encode(
            prefix + text,
            truncation=True,
            max_length=input_max_length,
            return_tensors="pt"
        ).to(self.device)

        # Generate summary
        summary_ids = self.abstractive_model.generate(
            input_ids,
            max_length=max_length,
            min_length=min_length,
            num_beams=num_beams,
            length_penalty=2.0,
            early_stopping=True,
            no_repeat_ngram_size=3
        )

        summary = self.abstractive_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summary

    def section_based_summarization(
        self,
        document_structure: Dict[str, str],
        method: str = "hybrid",
        ratio: float = 0.3
    ) -> Dict[str, str]:
        """
        Summarize each section of a document separately.
        Args:
            document_structure: Dictionary with section names as keys and section text as values
            method: Summarization method ('extractive', 'abstractive', or 'hybrid')
            ratio: Percentage of sentences to keep for extractive summarization
        Returns:
            Dictionary with section names as keys and summaries as values
        """
        section_summaries = {}
        for section_name, section_text in document_structure.items():
            # Skip empty sections or very short sections
            if not section_text or len(section_text) < 100:
                section_summaries[section_name] = section_text
                continue

            if method == "extractive":
                sentences = section_text.split('. ')
                sentences = [s + '.' for s in sentences if s]
                summary = ' '.join(self.extractive_summarize(sentences, ratio))
            elif method == "abstractive":
                # For short sections, use the original text
                if len(section_text) < 500:
                    summary = section_text
                else:
                    summary = self.abstractive_summarize(
                        section_text,
                        max_length=min(512, max(150, len(section_text) // 3)),
                        min_length=min(100, max(50, len(section_text) // 5))
                    )
            elif method == "hybrid":
                # For longer sections, first extract important sentences, then generate abstractive summary
                if len(section_text) < 500:
                    summary = section_text
                else:
                    sentences = section_text.split('. ')
                    sentences = [s + '.' for s in sentences if s]
                    extracted_text = ' '.join(self.extractive_summarize(sentences, ratio=0.5))

                    # If the extracted text is still long, generate abstractive summary
                    if len(extracted_text) > 1000:
                        summary = self.abstractive_summarize(
                            extracted_text,
                            max_length=min(512, len(extracted_text) // 2),
                            min_length=min(150, len(extracted_text) // 4)
                        )
                    else:
                        summary = extracted_text
            else:
                raise ValueError(f"Unknown summarization method: {method}")

            section_summaries[section_name] = summary

        return section_summaries

    def keyword_extraction(self, text: str, num_keywords: int = 10) -> List[str]:
        """
        Extract key legal terms and concepts from text.
        Args:
            text: Document text
            num_keywords: Number of keywords to extract
        Returns:
            List of extracted keywords
        """
        # Fit and transform the text
        tfidf_matrix = self.tfidf_vectorizer.fit_transform([text])

        # Get feature names
        feature_names = self.tfidf_vectorizer.get_feature_names_out()

        # Get sorted indices of top-n features
        indices = np.argsort(tfidf_matrix.toarray()[0])[-num_keywords:]

        # Get top-n keywords
        top_keywords = [feature_names[i] for i in indices]
        return top_keywords[::-1]  # Reverse to get highest score first

    def highlight_key_sentences(
        self,
        text: str,
        sentences: List[str],
        num_highlights: int = 5
    ) -> Dict[str, float]:
        """
        Identify and score key sentences for highlighting.
        Args:
            text: Full document text
            sentences: List of sentences
            num_highlights: Number of sentences to highlight
        Returns:
            Dictionary mapping sentences to their importance scores
        """
        # Handle case with very few sentences
        if len(sentences) <= num_highlights:
            return {s: 1.0 for s in sentences}

        # Extract keywords
        keywords = self.keyword_extraction(text, num_keywords=20)

        # Initialize importance scores
        scores = {}

        # Score sentences based on position, length and keyword presence
        for i, sentence in enumerate(sentences):
            # Position score (earlier and later sentences tend to be more important)
            position_score = 1.0
            if i < len(sentences) * 0.2:  # First 20%
                position_score = 1.5
            elif i > len(sentences) * 0.8:  # Last 20%
                position_score = 1.2

            # Length score (avoid very short sentences)
            length_score = min(1.0, len(sentence) / 100)

            # Keyword score
            keyword_score = 0
            for keyword in keywords:
                if keyword.lower() in sentence.lower():
                    keyword_score += 1
            keyword_score = min(1.0, keyword_score / 5)  # Normalize

            # Combine scores
            scores[sentence] = (position_score + length_score + keyword_score) / 3

        # Sort by score and get top sentences
        sorted_sentences = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        return dict(sorted_sentences[:num_highlights])

    def generate_document_summary(
        self,
        text: str,
        document_structure: Optional[Dict[str, str]] = None,
        method: str = "hybrid",
        ratio: float = 0.3,
        include_keywords: bool = True
    ) -> Dict:
        """
        Generate a comprehensive document summary.
        Args:
            text: Full document text
            document_structure: Optional dictionary with section structure
            method: Summarization method
            ratio: Extractive summarization ratio
            include_keywords: Whether to include keywords in the summary
        Returns:
            Dictionary containing summary information
        """
        result = {}

        # Generate overall summary
        if len(text) > 10000:  # For very long documents, use hybrid approach
            sentences = text.split('. ')
            sentences = [s + '.' for s in sentences if s]
            extracted_text = ' '.join(self.extractive_summarize(sentences, ratio=0.3))
            result["overall_summary"] = self.abstractive_summarize(extracted_text, max_length=512)
        else:
            result["overall_summary"] = self.abstractive_summarize(text)

        # Generate section summaries if structure is provided
        if document_structure:
            result["section_summaries"] = self.section_based_summarization(
                document_structure,
                method=method,
                ratio=ratio
            )

        # Extract keywords
        if include_keywords:
            result["keywords"] = self.keyword_extraction(text, num_keywords=15)

        # Highlight key sentences
        sentences = text.split('. ')
        sentences = [s + '.' for s in sentences if s and len(s) > 20]  # Skip very short fragments
        result["key_sentences"] = self.highlight_key_sentences(text, sentences)

        return result

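# Example usage of LegalSummarizer (illustrative sketch only; reuses the
# doc_info dictionary from the LegalDocumentProcessor example above):
#
#   summarizer = LegalSummarizer()
#   summary = summarizer.generate_document_summary(
#       doc_info["preprocessed_text"],
#       document_structure=doc_info["structure"],
#       method="hybrid",
#       ratio=0.3,
#   )
#   print(summary["overall_summary"])
#   print(summary["keywords"])
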
class LegalLongDocumentSummarizer:
    """
    A summarizer designed specifically for long legal documents,
    using a divide-and-conquer approach with potential for fine-tuning.
    """

    def __init__(
        self,
        model_name: str = "facebook/bart-large-cnn",
        max_chunk_length: int = 1024,
        use_gpu: bool = torch.cuda.is_available()
    ):
        """
        Initialize the long document summarizer.
        Args:
            model_name: Model name for the summarizer
            max_chunk_length: Size of each chunk (characters for splitting; also used as the token truncation limit)
            use_gpu: Whether to use GPU for inference
        """
        self.device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        self.model.to(self.device)
        self.max_chunk_length = max_chunk_length

    def summarize_long_document(self, text: str, max_length: int = 512, min_length: int = 150) -> str:
        """
        Summarize a long legal document by dividing it into chunks.
        Args:
            text: Long document text
            max_length: Maximum length of each chunk summary
            min_length: Minimum length of each chunk summary
        Returns:
            Combined summary of all chunks
        """
        # Split the document into chunks
        chunks = [text[i:i + self.max_chunk_length] for i in range(0, len(text), self.max_chunk_length)]

        # Summarize each chunk
        summaries = []
        for chunk in chunks:
            inputs = self.tokenizer(chunk, return_tensors="pt", truncation=True, max_length=self.max_chunk_length).to(self.device)
            summary_ids = self.model.generate(
                inputs['input_ids'],
                max_length=max_length,
                min_length=min_length,
                length_penalty=2.0,
                num_beams=4,
                early_stopping=True
            )
            summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
            summaries.append(summary)

        # Combine summaries
        combined_summary = ' '.join(summaries)
        return combined_summary
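
# Example end-to-end run for very long filings (illustrative sketch only;
# "long_filing.pdf" is a placeholder path):
#
#   if __name__ == "__main__":
#       processor = LegalDocumentProcessor()
#       long_doc = processor.process_document("long_filing.pdf")
#       long_summarizer = LegalLongDocumentSummarizer()
#       print(long_summarizer.summarize_long_document(long_doc["preprocessed_text"]))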