Soundaryasos committed (verified)
Commit af8f925 · 1 Parent(s): fddb56a

Update app.py

Files changed (1): app.py (+94 -783)
app.py CHANGED
@@ -1,793 +1,104 @@
- import re
- import spacy
- import pandas as pd
- from typing import List, Dict, Tuple, Optional
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
  import fitz # PyMuPDF
  import docx
- from bs4 import BeautifulSoup
  import nltk
- from nltk.tokenize import sent_tokenize
- import numpy as np
- import torch
  import networkx as nx
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.metrics.pairwise import cosine_similarity
- from sentence_transformers import SentenceTransformer
-
- # Download necessary NLTK data
- nltk.download('punkt')
-
- # Load legal-specific NLP model
  nlp = spacy.load("en_core_web_lg")
-
- class LegalDocumentProcessor:
-     """
-     A comprehensive pipeline for processing legal documents.
-     Handles document loading, text extraction, preprocessing, and tokenization.
-     """
-
-     def __init__(self, tokenizer_name: str = "nlpaueb/legal-bert-base-uncased"):
-         """
-         Initialize the legal document processor.
-         Args:
-             tokenizer_name: The HuggingFace tokenizer to use for transformer models
-         """
-         self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
-
-         # Legal-specific patterns
-         self.legal_abbreviations = {
-             "et al.": "and others",
-             "i.e.": "that is",
-             "e.g.": "for example",
-             "v.": "versus",
-             "cf.": "compare",
-             "viz.": "namely",
-             "ex rel.": "on behalf of",
-             "etc.": "etcetera"
-         }
-
-         # Regular expressions for legal citations and references
-         self.citation_pattern = re.compile(r'\d+\s+[A-Za-z\.]+\s+\d+')
-         self.section_pattern = re.compile(r'Section\s+\d+\.\d+', re.IGNORECASE)
-
-         # Legal boilerplate text patterns
-         self.boilerplate_patterns = [
-             r"IN WITNESS WHEREOF.*",
-             r"WHEREAS,.*",
-             r"NOW, THEREFORE,.*",
-             r"The parties hereby agree as follows:.*"
-         ]
-         self.boilerplate_regex = re.compile('|'.join(self.boilerplate_patterns), re.DOTALL)
-
-     def extract_text_from_file(self, file_path: str) -> str:
-         """
-         Extract text from various file formats (PDF, DOCX, TXT, HTML).
-         Args:
-             file_path: Path to the legal document file
-         Returns:
-             Extracted text as a string
-         """
-         file_extension = file_path.split('.')[-1].lower()
-         if file_extension == 'pdf':
-             return self._extract_from_pdf(file_path)
-         elif file_extension in ['docx', 'doc']:
-             return self._extract_from_docx(file_path)
-         elif file_extension == 'txt':
-             with open(file_path, 'r', encoding='utf-8') as f:
-                 return f.read()
-         elif file_extension in ['html', 'htm']:
-             return self._extract_from_html(file_path)
-         else:
-             raise ValueError(f"Unsupported file format: {file_extension}")
-
-     def _extract_from_pdf(self, file_path: str) -> str:
-         """Extract text from PDF files"""
-         doc = fitz.open(file_path)
-         text = ""
-         for page in doc:
-             text += page.get_text()
-         return text
-
-     def _extract_from_docx(self, file_path: str) -> str:
-         """Extract text from DOCX files"""
-         doc = docx.Document(file_path)
-         return '\n'.join([para.text for para in doc.paragraphs])
-
-     def _extract_from_html(self, file_path: str) -> str:
-         """Extract text from HTML files"""
-         with open(file_path, 'r', encoding='utf-8') as f:
-             soup = BeautifulSoup(f.read(), 'html.parser')
-         return soup.get_text()
-
-     def preprocess_text(self, text: str) -> str:
-         """
-         Preprocess legal text by:
-         - Expanding abbreviations
-         - Removing redundant whitespace
-         - Handling special characters
-         - Maintaining sentence structure
-         Args:
-             text: Raw text extracted from a legal document
-         Returns:
-             Preprocessed text
-         """
-         # Replace legal abbreviations
-         for abbr, expansion in self.legal_abbreviations.items():
-             text = re.sub(r'\b' + re.escape(abbr) + r'\b', expansion, text)
-
-         # Remove redundant whitespace
-         text = re.sub(r'\s+', ' ', text)
-
-         # Separate citation references to prevent them from merging with sentences
-         text = re.sub(self.citation_pattern, r' \g<0> ', text)
-
-         # Handle section references
-         text = re.sub(self.section_pattern, r' \g<0> ', text)
-
-         # Normalize newlines to separate sections properly
-         text = re.sub(r'\n+', '\n', text)
-
-         return text.strip()
-
-     def identify_document_structure(self, text: str) -> Dict[str, List[str]]:
-         """
-         Identify key structural elements in the legal document.
-         Args:
-             text: Preprocessed legal document text
-         Returns:
-             Dictionary containing identified sections
-         """
-         # Split into sections based on headers
-         sections = {}
-
-         # Identify potential headers (uppercase text followed by newline)
-         potential_headers = re.finditer(r'([A-Z][A-Z\s]+[A-Z])[:\.\n]', text)
-
-         # Extract sections based on identified headers
-         last_pos = 0
-         last_header = "PREAMBLE"
-         for match in potential_headers:
-             header = match.group(1).strip()
-             start_pos = match.start()
-
-             # Add the previous section
-             if last_pos < start_pos:
-                 sections[last_header] = text[last_pos:start_pos].strip()
-
-             last_pos = match.end()
-             last_header = header
-
-         # Add the final section
-         if last_pos < len(text):
-             sections[last_header] = text[last_pos:].strip()
-
-         return sections
-
-     def extract_sentences(self, text: str) -> List[str]:
-         """
-         Split text into sentences, handling legal-specific patterns.
-         Args:
-             text: Preprocessed legal document text
-         Returns:
-             List of sentences
-         """
-         # Use NLTK's sentence tokenizer as a base
-         sentences = sent_tokenize(text)
-
-         # Post-process to handle potential issues with legal text
-         processed_sentences = []
-         for sentence in sentences:
-             # Skip empty sentences
-             if not sentence.strip():
-                 continue
-
-             # Clean up sentences
-             sentence = sentence.strip()
-
-             # Check if sentence is too long (might be incorrectly split)
-             if len(sentence) > 500:
-                 # Try to break it further at punctuation marks
-                 sub_sentences = re.split(r'[;:](?=\s)', sentence)
-                 processed_sentences.extend([s.strip() for s in sub_sentences if s.strip()])
-             else:
-                 processed_sentences.append(sentence)
-
-         return processed_sentences
-
-     def tokenize_for_transformer(self, text: str, max_length: int = 512) -> Dict:
-         """
-         Tokenize text for transformer-based models.
-         Args:
-             text: Input text to tokenize
-             max_length: Maximum token length for the model
-         Returns:
-             Tokenized input dict ready for transformer models
-         """
-         return self.tokenizer(
-             text,
-             padding="max_length",
-             truncation=True,
-             max_length=max_length,
-             return_tensors="pt"
-         )
-
-     def extract_entities(self, text: str) -> List[Dict]:
-         """
-         Extract legal entities from text using spaCy.
-         Args:
-             text: Legal document text
-         Returns:
-             List of extracted entities with type information
-         """
-         doc = nlp(text)
-         entities = []
-         for ent in doc.ents:
-             entities.append({
-                 "text": ent.text,
-                 "start": ent.start_char,
-                 "end": ent.end_char,
-                 "type": ent.label_
-             })
-
-         # Additional legal entity extraction for common patterns
-         # Extract case citations
-         case_citations = re.finditer(r'[A-Za-z\s]+ v\. [A-Za-z\s]+,?\s+\d+\s+[A-Za-z\.]+\s+\d+', text)
-         for match in case_citations:
-             entities.append({
-                 "text": match.group(0),
-                 "start": match.start(),
-                 "end": match.end(),
-                 "type": "CASE_CITATION"
-             })
-
-         # Extract statutory references
-         statutes = re.finditer(r'\d+\s+U\.S\.C\.\s+§\s+\d+', text)
-         for match in statutes:
-             entities.append({
-                 "text": match.group(0),
-                 "start": match.start(),
-                 "end": match.end(),
-                 "type": "STATUTE"
-             })
-
-         return entities
-
-     def chunk_document(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
-         """
-         Split document into overlapping chunks for processing.
-         Args:
-             text: Document text
-             chunk_size: Approximate size of each chunk in characters
-             overlap: Number of characters to overlap between chunks
-         Returns:
-             List of document chunks
-         """
-         # First split by sentences
-         sentences = self.extract_sentences(text)
-         chunks = []
-         current_chunk = []
-         current_length = 0
-         for sentence in sentences:
-             sentence_length = len(sentence)
-
-             # If adding this sentence would exceed chunk size
-             if current_length + sentence_length > chunk_size and current_chunk:
-                 # Add the current chunk to our list of chunks
-                 chunks.append(' '.join(current_chunk))
-
-                 # Start a new chunk with overlap
-                 # Find sentences to keep for overlap
-                 overlap_chars = 0
-                 overlap_sentences = []
-                 for s in reversed(current_chunk):
-                     if overlap_chars + len(s) <= overlap:
-                         overlap_sentences.insert(0, s)
-                         overlap_chars += len(s) + 1 # +1 for the space
-                     else:
-                         break
-
-                 current_chunk = overlap_sentences
-                 current_length = overlap_chars
-
-             current_chunk.append(sentence)
-             current_length += sentence_length + 1 # +1 for the space
-
-         # Don't forget the last chunk
-         if current_chunk:
-             chunks.append(' '.join(current_chunk))
-
-         return chunks
-
-     def process_document(self, file_path: str) -> Dict:
-         """
-         Complete processing pipeline for a legal document.
-         Args:
-             file_path: Path to the legal document
-         Returns:
-             Dictionary containing processed document information
-         """
-         # Extract text from file
-         raw_text = self.extract_text_from_file(file_path)
-
-         # Preprocess the text
-         preprocessed_text = self.preprocess_text(raw_text)
-
-         # Identify document structure
-         structure = self.identify_document_structure(preprocessed_text)
-
-         # Extract sentences
-         sentences = self.extract_sentences(preprocessed_text)
-
-         # Chunk document for processing
-         chunks = self.chunk_document(preprocessed_text)
-
-         # Extract entities
-         entities = self.extract_entities(preprocessed_text)
-
-         return {
-             "raw_text": raw_text,
-             "preprocessed_text": preprocessed_text,
-             "structure": structure,
-             "sentences": sentences,
-             "chunks": chunks,
-             "entities": entities
-         }
-
- class LegalSummarizer:
-     """
-     A comprehensive summarization engine for legal documents that implements
-     both extractive and abstractive summarization techniques.
-     """
-
-     def __init__(
-         self,
-         extractive_model: str = "sentence-transformers/all-MiniLM-L6-v2",
-         abstractive_model: str = "facebook/bart-large-cnn",
-         use_gpu: bool = torch.cuda.is_available()
-     ):
-         """
-         Initialize the legal summarization engine.
-         Args:
-             extractive_model: Model name for sentence embeddings (extractive)
-             abstractive_model: Model name for seq2seq summarization (abstractive)
-             use_gpu: Whether to use GPU for inference
-         """
-         self.device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
-
-         # Load models
-         print(f"Loading extractive model: {extractive_model}")
-         self.sentence_model = SentenceTransformer(extractive_model)
-         self.sentence_model.to(self.device)
-
-         print(f"Loading abstractive model: {abstractive_model}")
-         self.abstractive_tokenizer = AutoTokenizer.from_pretrained(abstractive_model)
-         self.abstractive_model = AutoModelForSeq2SeqLM.from_pretrained(abstractive_model)
-         self.abstractive_model.to(self.device)
-
-         # Initialize TF-IDF vectorizer for keyword extraction
-         self.tfidf_vectorizer = TfidfVectorizer(
-             max_features=5000,
-             stop_words='english',
-             ngram_range=(1, 2)
-         )
-
-     def extractive_summarize(
-         self,
-         sentences: List[str],
-         ratio: float = 0.3,
-         method: str = "textrank"
-     ) -> List[str]:
-         """
-         Generate an extractive summary of the document.
-         Args:
-             sentences: List of sentences from the document
-             ratio: Percentage of sentences to keep (0.0-1.0)
-             method: Summarization method ('textrank', 'lexrank', or 'tfidf')
-         Returns:
-             List of extracted sentences forming the summary
-         """
-         if len(sentences) == 0:
-             return []
-
-         # Ensure we have a valid ratio
-         ratio = max(0.1, min(0.9, ratio))
-         num_sentences = max(1, int(len(sentences) * ratio))
-
-         if method == "textrank":
-             return self._textrank_summarize(sentences, num_sentences)
-         elif method == "lexrank":
-             return self._lexrank_summarize(sentences, num_sentences)
-         elif method == "tfidf":
-             return self._tfidf_summarize(sentences, num_sentences)
-         else:
-             raise ValueError(f"Unknown summarization method: {method}")
-
-     def _textrank_summarize(self, sentences: List[str], num_sentences: int) -> List[str]:
-         """
-         TextRank-based extractive summarization.
-         Args:
-             sentences: List of document sentences
-             num_sentences: Number of sentences to extract
-         Returns:
-             List of extracted sentences
-         """
-         # Compute sentence embeddings
-         embeddings = self.sentence_model.encode(sentences, convert_to_tensor=True)
-         embeddings = embeddings.cpu().numpy()
-
-         # Compute similarity matrix
-         sim_matrix = cosine_similarity(embeddings)
-
-         # Create graph and run PageRank
-         nx_graph = nx.from_numpy_array(sim_matrix)
-         scores = nx.pagerank(nx_graph)
-
-         # Sort sentences by score
-         ranked_sentences = sorted(((scores[i], s, i) for i, s in enumerate(sentences)), reverse=True)
-
-         # Select top sentences and preserve original order
-         top_sentence_indices = sorted([item[2] for item in ranked_sentences[:num_sentences]])
-         return [sentences[i] for i in top_sentence_indices]
-
-     def _lexrank_summarize(self, sentences: List[str], num_sentences: int) -> List[str]:
-         """
-         LexRank-based extractive summarization.
-         Args:
-             sentences: List of document sentences
-             num_sentences: Number of sentences to extract
-         Returns:
-             List of extracted sentences
-         """
-         # Compute sentence embeddings
-         embeddings = self.sentence_model.encode(sentences, convert_to_tensor=True)
-         embeddings = embeddings.cpu().numpy()
-
-         # Compute similarity matrix
-         sim_matrix = cosine_similarity(embeddings)
-
-         # Apply threshold to create a binary similarity matrix
-         threshold = 0.3 # Can be tuned
-         sim_matrix_binary = (sim_matrix > threshold).astype(int)
-
-         # Normalize the matrix by row sums
-         row_sums = sim_matrix_binary.sum(axis=1, keepdims=True)
-         row_sums[row_sums == 0] = 1 # Avoid division by zero
-         transition_matrix = sim_matrix_binary / row_sums
-
-         # Apply power iteration to find the principal eigenvector
-         scores = np.ones(len(sentences)) / len(sentences)
-         epsilon = 1e-4
-         max_iter = 100
-         for _ in range(max_iter):
-             prev_scores = scores.copy()
-             scores = np.dot(transition_matrix.T, scores)
-             scores = scores / np.sum(scores)
-             if np.sum(np.abs(scores - prev_scores)) < epsilon:
-                 break
-
-         # Rank sentences
-         ranked_indices = np.argsort(-scores)
-
-         # Select top sentences and preserve original order
-         top_sentence_indices = sorted(ranked_indices[:num_sentences])
-         return [sentences[i] for i in top_sentence_indices]
-
-     def _tfidf_summarize(self, sentences: List[str], num_sentences: int) -> List[str]:
-         """
-         TF-IDF based extractive summarization.
-         Args:
-             sentences: List of document sentences
-             num_sentences: Number of sentences to extract
-         Returns:
-             List of extracted sentences
-         """
-         # Handle the case where we have only one sentence
-         if len(sentences) <= 1:
-             return sentences
-
-         # Compute TF-IDF matrix
-         tfidf_matrix = self.tfidf_vectorizer.fit_transform(sentences)
-
-         # Compute document centroid
-         centroid = tfidf_matrix.mean(axis=0)
-
-         # Compute similarity of each sentence to centroid
-         similarities = []
-         for i in range(tfidf_matrix.shape[0]):
-             similarity = cosine_similarity(tfidf_matrix[i], centroid)[0][0]
-             similarities.append((i, similarity))
-
-         # Rank sentences
-         ranked_sentences = sorted(similarities, key=lambda x: x[1], reverse=True)
-
-         # Select top sentences and preserve original order
-         top_sentence_indices = sorted([idx for idx, _ in ranked_sentences[:num_sentences]])
-         return [sentences[i] for i in top_sentence_indices]
-
-     def abstractive_summarize(
-         self,
-         text: str,
-         max_length: int = 512,
-         min_length: int = 150,
-         num_beams: int = 4,
-         legal_context: bool = True
-     ) -> str:
-         """
-         Generate an abstractive summary of the document.
-         Args:
-             text: Text to summarize
-             max_length: Maximum length of the summary
-             min_length: Minimum length of the summary
-             num_beams: Number of beams to use for beam search
-             legal_context: Add legal domain context to input
-         Returns:
-             Abstractive summary as a string
-         """
-         # Truncate long text to model's maximum input length
-         input_max_length = self.abstractive_tokenizer.model_max_length - 100 # Leave room for summary
-
-         # Tokenize and truncate
-         input_ids = self.abstractive_tokenizer.encode(
-             text,
-             truncation=True,
-             max_length=input_max_length,
-             return_tensors="pt"
-         ).to(self.device)
-
-         # Add legal context if requested
-         prefix = "Summarize this legal document: " if legal_context else ""
-
-         # Generate summary
-         summary_ids = self.abstractive_model.generate(
-             input_ids,
-             max_length=max_length,
-             min_length=min_length,
-             num_beams=num_beams,
-             length_penalty=2.0,
-             early_stopping=True,
-             no_repeat_ngram_size=3
-         )
-
-         summary = self.abstractive_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-         return summary
-
-     def section_based_summarization(
-         self,
-         document_structure: Dict[str, str],
-         method: str = "hybrid",
-         ratio: float = 0.3
-     ) -> Dict[str, str]:
-         """
-         Summarize each section of a document separately.
-         Args:
-             document_structure: Dictionary with section names as keys and section text as values
-             method: Summarization method ('extractive', 'abstractive', or 'hybrid')
-             ratio: Percentage of sentences to keep for extractive summarization
-         Returns:
-             Dictionary with section names as keys and summaries as values
-         """
-         section_summaries = {}
-         for section_name, section_text in document_structure.items():
-             # Skip empty sections or very short sections
-             if not section_text or len(section_text) < 100:
-                 section_summaries[section_name] = section_text
-                 continue
-
-             if method == "extractive":
-                 sentences = section_text.split('. ')
-                 sentences = [s + '.' for s in sentences if s]
-                 summary = ' '.join(self.extractive_summarize(sentences, ratio))
-             elif method == "abstractive":
-                 # For short sections, use the original text
-                 if len(section_text) < 500:
-                     summary = section_text
-                 else:
-                     summary = self.abstractive_summarize(
-                         section_text,
-                         max_length=min(512, max(150, len(section_text) // 3)),
-                         min_length=min(100, max(50, len(section_text) // 5))
-                     )
-             elif method == "hybrid":
-                 # For longer sections, first extract important sentences, then generate abstractive summary
-                 if len(section_text) < 500:
-                     summary = section_text
-                 else:
-                     sentences = section_text.split('. ')
-                     sentences = [s + '.' for s in sentences if s]
-                     extracted_text = ' '.join(self.extractive_summarize(sentences, ratio=0.5))
-
-                     # If the extracted text is still long, generate abstractive summary
-                     if len(extracted_text) > 1000:
-                         summary = self.abstractive_summarize(
-                             extracted_text,
-                             max_length=min(512, len(extracted_text) // 2),
-                             min_length=min(150, len(extracted_text) // 4)
-                         )
-                     else:
-                         summary = extracted_text
-             else:
-                 raise ValueError(f"Unknown summarization method: {method}")
-
-             section_summaries[section_name] = summary
-
-         return section_summaries
-
-     def keyword_extraction(self, text: str, num_keywords: int = 10) -> List[str]:
-         """
-         Extract key legal terms and concepts from text.
-         Args:
-             text: Document text
-             num_keywords: Number of keywords to extract
-         Returns:
-             List of extracted keywords
-         """
-         # Fit and transform the text
-         tfidf_matrix = self.tfidf_vectorizer.fit_transform([text])
-
-         # Get feature names
-         feature_names = self.tfidf_vectorizer.get_feature_names_out()
-
-         # Get sorted indices of top-n features
-         indices = np.argsort(tfidf_matrix.toarray()[0])[-num_keywords:]
-
-         # Get top-n keywords
-         top_keywords = [feature_names[i] for i in indices]
-         return top_keywords[::-1] # Reverse to get highest score first
-
-     def highlight_key_sentences(
-         self,
-         text: str,
-         sentences: List[str],
-         num_highlights: int = 5
-     ) -> Dict[str, float]:
-         """
-         Identify and score key sentences for highlighting.
-         Args:
-             text: Full document text
-             sentences: List of sentences
-             num_highlights: Number of sentences to highlight
-         Returns:
-             Dictionary mapping sentences to their importance scores
-         """
-         # Handle case with very few sentences
-         if len(sentences) <= num_highlights:
-             return {s: 1.0 for s in sentences}
-
-         # Extract keywords
-         keywords = self.keyword_extraction(text, num_keywords=20)
-
-         # Initialize importance scores
-         scores = {}
-
-         # Score sentences based on position, length and keyword presence
-         for i, sentence in enumerate(sentences):
-             # Position score (earlier and later sentences tend to be more important)
-             position_score = 1.0
-             if i < len(sentences) * 0.2: # First 20%
-                 position_score = 1.5
-             elif i > len(sentences) * 0.8: # Last 20%
-                 position_score = 1.2
-
-             # Length score (avoid very short sentences)
-             length_score = min(1.0, len(sentence) / 100)
-
-             # Keyword score
-             keyword_score = 0
-             for keyword in keywords:
-                 if keyword.lower() in sentence.lower():
-                     keyword_score += 1
-             keyword_score = min(1.0, keyword_score / 5) # Normalize
-
-             # Combine scores
-             scores[sentence] = (position_score + length_score + keyword_score) / 3
-
-         # Sort by score and get top sentences
-         sorted_sentences = sorted(scores.items(), key=lambda x: x[1], reverse=True)
-         return dict(sorted_sentences[:num_highlights])
-
-     def generate_document_summary(
-         self,
-         text: str,
-         document_structure: Optional[Dict[str, str]] = None,
-         method: str = "hybrid",
-         ratio: float = 0.3,
-         include_keywords: bool = True
-     ) -> Dict:
-         """
-         Generate a comprehensive document summary.
-         Args:
-             text: Full document text
-             document_structure: Optional dictionary with section structure
-             method: Summarization method
-             ratio: Extractive summarization ratio
-             include_keywords: Whether to include keywords in the summary
-         Returns:
-             Dictionary containing summary information
-         """
-         result = {}
-
-         # Generate overall summary
-         if len(text) > 10000: # For very long documents, use hybrid approach
-             sentences = text.split('. ')
-             sentences = [s + '.' for s in sentences if s]
-             extracted_text = ' '.join(self.extractive_summarize(sentences, ratio=0.3))
-             result["overall_summary"] = self.abstractive_summarize(extracted_text, max_length=512)
-         else:
-             result["overall_summary"] = self.abstractive_summarize(text)
-
-         # Generate section summaries if structure is provided
-         if document_structure:
-             result["section_summaries"] = self.section_based_summarization(
-                 document_structure,
-                 method=method,
-                 ratio=ratio
-             )
-
-         # Extract keywords
-         if include_keywords:
-             result["keywords"] = self.keyword_extraction(text, num_keywords=15)
-
-         # Highlight key sentences
-         sentences = text.split('. ')
-         sentences = [s + '.' for s in sentences if s and len(s) > 20] # Skip very short fragments
-         result["key_sentences"] = self.highlight_key_sentences(text, sentences)
-
-         return result
-
- class LegalLongDocumentSummarizer:
-     """
-     A summarizer designed specifically for long legal documents,
-     using a divide-and-conquer approach with potential for fine-tuning.
-     """
-
-     def __init__(
-         self,
-         model_name: str = "facebook/bart-large-cnn",
-         max_chunk_length: int = 1024,
-         use_gpu: bool = torch.cuda.is_available()
-     ):
-         """
-         Initialize the long document summarizer.
-         Args:
-             model_name: Model name for the summarizer
-             max_chunk_length: Maximum token length for each chunk
-             use_gpu: Whether to use GPU for inference
-         """
-         self.device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
-         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
-         self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-         self.model.to(self.device)
-         self.max_chunk_length = max_chunk_length
-
-     def summarize_long_document(self, text: str, max_length: int = 512, min_length: int = 150) -> str:
-         """
-         Summarize a long legal document by dividing it into chunks.
-         Args:
-             text: Long document text
-             max_length: Maximum length of the summary
-             min_length: Minimum length of the summary
-         Returns:
-             Combined summary of all chunks
-         """
-         # Split the document into chunks
-         chunks = [text[i:i+self.max_chunk_length] for i in range(0, len(text), self.max_chunk_length)]
-
-         # Summarize each chunk
-         summaries = []
-         for chunk in chunks:
-             inputs = self.tokenizer(chunk, return_tensors="pt", truncation=True, max_length=self.max_chunk_length).to(self.device)
-             summary_ids = self.model.generate(
-                 inputs['input_ids'],
-                 max_length=max_length,
-                 min_length=min_length,
-                 length_penalty=2.0,
-                 num_beams=4,
-                 early_stopping=True
-             )
-             summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-             summaries.append(summary)
-
-         # Combine summaries
-         combined_summary = ' '.join(summaries)
-         return combined_summary
+ import streamlit as st
  import fitz # PyMuPDF
  import docx
  import nltk
+ import spacy
  import networkx as nx
+ import matplotlib.pyplot as plt
+ from transformers import pipeline
+ from collections import Counter
+
+ # Load NLP Models
+ nltk.download("punkt")
  nlp = spacy.load("en_core_web_lg")
+ summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+ qa_pipeline = pipeline("question-answering")
+
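
Note that `pipeline("question-answering")` is created without an explicit model, so transformers falls back to a library default (currently distilbert-base-cased-distilled-squad) and logs a warning; the app's answers can change whenever that default changes. A minimal sketch of pinning the checkpoint, an assumption for reproducibility rather than part of this commit:

    # Hypothetical: pin the QA checkpoint instead of relying on the pipeline default.
    qa_pipeline = pipeline(
        "question-answering",
        model="distilbert-base-cased-distilled-squad",
    )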
+ # Function to extract text from PDF
+ def extract_text_from_pdf(pdf_file):
+     doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
+     text = "\n".join([page.get_text("text") for page in doc])
+     return text
+
+ # Function to extract text from DOCX
+ def extract_text_from_docx(docx_file):
+     doc = docx.Document(docx_file)
+     text = "\n".join([para.text for para in doc.paragraphs])
+     return text
+
+ # Summarization function
+ def summarize_text(text):
+     # truncation=True keeps long documents within the model's input limit instead of erroring
+     return summarizer(text, max_length=200, min_length=50, do_sample=False, truncation=True)[0]["summary_text"]
+
+ # Q&A Function
+ def answer_question(text, question):
+     return qa_pipeline({"context": text, "question": question})["answer"]
+
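
facebook/bart-large-cnn reads only about 1024 tokens, so even with truncation a long contract is summarized from its opening pages alone. A chunked variant in the spirit of the deleted LegalLongDocumentSummarizer could look like this (a sketch; the name summarize_long_text and the 3000-character chunk size are illustrative assumptions):

    def summarize_long_text(text, chunk_chars=3000):
        # Summarize the document piece by piece, then join the partial summaries.
        chunks = [text[i:i + chunk_chars] for i in range(0, len(text), chunk_chars)]
        parts = [summarizer(c, max_length=150, min_length=30, do_sample=False,
                            truncation=True)[0]["summary_text"] for c in chunks]
        return " ".join(parts)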
+ # Named Entity Recognition (NER)
+ def extract_entities(text):
+     doc = nlp(text)
+     entities = [(ent.text, ent.label_) for ent in doc.ents]
+     return entities
+
+ # Generate Mind Map
+ def generate_mind_map(text):
+     doc = nlp(text)
+     entity_counts = Counter([ent.text for ent in doc.ents])
+
+     G = nx.Graph()
+     for entity, count in entity_counts.items():
+         G.add_node(entity, size=count * 100)
+
+     pos = nx.spring_layout(G)
+     plt.figure(figsize=(10, 7))
+     nx.draw(G, pos, with_labels=True, node_size=[G.nodes[n]['size'] for n in G.nodes], node_color="skyblue")
+     plt.title("Mind Map of Entities")
+     st.pyplot(plt)
+
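
As committed, generate_mind_map adds nodes but never any edges, so the "mind map" renders as disconnected bubbles. One way to relate entities is sentence-level co-occurrence; a sketch of a loop that could follow the add_node calls (an assumed heuristic, not part of this commit):

    # Hypothetical: inside generate_mind_map, after the nodes are added,
    # connect entities that appear in the same sentence.
    for sent in doc.sents:  # en_core_web_lg segments sentences via its parser
        ents = sorted({ent.text for ent in sent.ents})
        for i in range(len(ents)):
            for j in range(i + 1, len(ents)):
                G.add_edge(ents[i], ents[j])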
+ # Streamlit UI
+ st.set_page_config(page_title="Legal Document Summarizer & Query System", layout="wide")
+ st.title("📜 Legal Document Summarization, NER & Mind Map System")
+ st.markdown("""Upload a legal document, get a summary, extract entities, and generate a mind map!""")
+
+ # File uploader
+ uploaded_file = st.file_uploader("Upload a PDF or DOCX", type=["pdf", "docx"])
+
+ if uploaded_file:
+     if uploaded_file.type == "application/pdf":
+         document_text = extract_text_from_pdf(uploaded_file)
+     elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+         document_text = extract_text_from_docx(uploaded_file)
+     else:
+         st.error("Unsupported file format!")
+         st.stop()
+
+     st.subheader("Extracted Text Preview")
+     st.text_area("Document Content", document_text[:2000], height=250)
+
+     # Summarization
+     if st.button("Summarize Document"):
+         summary = summarize_text(document_text)
+         st.subheader("📌 Summary")
+         st.success(summary)
+
+     # Question Answering
+     user_question = st.text_input("Ask a question about the document:")
+     if user_question:
+         answer = answer_question(document_text, user_question)
+         st.subheader("📝 Answer")
+         st.info(answer)
+
+     # Named Entity Recognition
+     if st.button("Extract Entities"):
+         entities = extract_entities(document_text)
+         st.subheader("📌 Named Entities")
+         for entity, label in entities:
+             st.write(f"**{entity}** - {label}")
+
+     # Mind Map Generation
+     if st.button("Generate Mind Map"):
+         st.subheader("🧠 Mind Map of Entities")
+         generate_mind_map(document_text)
+
+ st.markdown("---")
+ st.caption("🚀 Built with Hugging Face, spaCy, and Streamlit")
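
To try the new app locally, something like the following should work (the package list is inferred from the imports above; nothing here is pinned by this commit):

    pip install streamlit pymupdf python-docx nltk spacy networkx matplotlib transformers torch
    python -m spacy download en_core_web_lg
    streamlit run app.py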