Soundaryasos committed on
Commit 32cccf7 · verified · 1 Parent(s): 5053c88

Create app.py

Files changed (1)
  1. app.py +793 -0
app.py ADDED
@@ -0,0 +1,793 @@
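# NOTE: the imports below assume the usual PyPI distributions for these modules
# (transformers, torch, sentence-transformers, spacy, nltk, pymupdf for fitz,
# python-docx for docx, beautifulsoup4 for bs4, scikit-learn, networkx, numpy,
# pandas) plus the spaCy model: python -m spacy download en_core_web_lg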
import re
import spacy
import pandas as pd
from typing import List, Dict, Tuple, Optional
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
import fitz  # PyMuPDF
import docx
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import sent_tokenize
import numpy as np
import torch
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Download necessary NLTK data ('punkt_tab' is also required by newer NLTK releases)
nltk.download('punkt')
nltk.download('punkt_tab')

# Load legal-specific NLP model
nlp = spacy.load("en_core_web_lg")

class LegalDocumentProcessor:
    """
    A comprehensive pipeline for processing legal documents.
    Handles document loading, text extraction, preprocessing, and tokenization.
    """

    def __init__(self, tokenizer_name: str = "nlpaueb/legal-bert-base-uncased"):
        """
        Initialize the legal document processor.
        Args:
            tokenizer_name: The HuggingFace tokenizer to use for transformer models
        """
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

        # Legal-specific patterns
        self.legal_abbreviations = {
            "et al.": "and others",
            "i.e.": "that is",
            "e.g.": "for example",
            "v.": "versus",
            "cf.": "compare",
            "viz.": "namely",
            "ex rel.": "on behalf of",
            "etc.": "etcetera"
        }

        # Regular expressions for legal citations and references
        self.citation_pattern = re.compile(r'\d+\s+[A-Za-z\.]+\s+\d+')
        self.section_pattern = re.compile(r'Section\s+\d+\.\d+', re.IGNORECASE)

        # Legal boilerplate text patterns
        self.boilerplate_patterns = [
            r"IN WITNESS WHEREOF.*",
            r"WHEREAS,.*",
            r"NOW, THEREFORE,.*",
            r"The parties hereby agree as follows:.*"
        ]
        self.boilerplate_regex = re.compile('|'.join(self.boilerplate_patterns), re.DOTALL)

    def extract_text_from_file(self, file_path: str) -> str:
        """
        Extract text from various file formats (PDF, DOCX, TXT, HTML).
        Args:
            file_path: Path to the legal document file
        Returns:
            Extracted text as a string
        """
        file_extension = file_path.split('.')[-1].lower()
        if file_extension == 'pdf':
            return self._extract_from_pdf(file_path)
        elif file_extension in ['docx', 'doc']:
            return self._extract_from_docx(file_path)
        elif file_extension == 'txt':
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        elif file_extension in ['html', 'htm']:
            return self._extract_from_html(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_extension}")

    def _extract_from_pdf(self, file_path: str) -> str:
        """Extract text from PDF files"""
        doc = fitz.open(file_path)
        text = ""
        for page in doc:
            text += page.get_text()
        return text

    def _extract_from_docx(self, file_path: str) -> str:
        """Extract text from DOCX files"""
        doc = docx.Document(file_path)
        return '\n'.join([para.text for para in doc.paragraphs])

    def _extract_from_html(self, file_path: str) -> str:
        """Extract text from HTML files"""
        with open(file_path, 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')
        return soup.get_text()

    def preprocess_text(self, text: str) -> str:
        """
        Preprocess legal text by:
        - Expanding abbreviations
        - Removing redundant whitespace
        - Handling special characters
        - Maintaining sentence structure
        Args:
            text: Raw text extracted from a legal document
        Returns:
            Preprocessed text
        """
        # Replace legal abbreviations (use a lookahead instead of a trailing \b,
        # which never matches after a closing period)
        for abbr, expansion in self.legal_abbreviations.items():
            text = re.sub(r'\b' + re.escape(abbr) + r'(?!\w)', expansion, text)

        # Normalize newlines first so section boundaries survive,
        # then collapse redundant spaces and tabs
        text = re.sub(r'\n+', '\n', text)
        text = re.sub(r'[ \t]+', ' ', text)

        # Separate citation references to prevent them from merging with sentences
        text = re.sub(self.citation_pattern, r' \g<0> ', text)

        # Handle section references
        text = re.sub(self.section_pattern, r' \g<0> ', text)

        return text.strip()

    def identify_document_structure(self, text: str) -> Dict[str, str]:
        """
        Identify key structural elements in the legal document.
        Args:
            text: Preprocessed legal document text
        Returns:
            Dictionary mapping section headers to section text
        """
        # Split into sections based on headers
        sections = {}

        # Identify potential headers (uppercase text followed by a colon, period, or newline)
        potential_headers = re.finditer(r'([A-Z][A-Z\s]+[A-Z])[:\.\n]', text)

        # Extract sections based on identified headers
        last_pos = 0
        last_header = "PREAMBLE"
        for match in potential_headers:
            header = match.group(1).strip()
            start_pos = match.start()

            # Add the previous section
            if last_pos < start_pos:
                sections[last_header] = text[last_pos:start_pos].strip()

            last_pos = match.end()
            last_header = header

        # Add the final section
        if last_pos < len(text):
            sections[last_header] = text[last_pos:].strip()

        return sections

    def extract_sentences(self, text: str) -> List[str]:
        """
        Split text into sentences, handling legal-specific patterns.
        Args:
            text: Preprocessed legal document text
        Returns:
            List of sentences
        """
        # Use NLTK's sentence tokenizer as a base
        sentences = sent_tokenize(text)

        # Post-process to handle potential issues with legal text
        processed_sentences = []
        for sentence in sentences:
            # Skip empty sentences
            if not sentence.strip():
                continue

            # Clean up sentences
            sentence = sentence.strip()

            # Check if sentence is too long (might be incorrectly split)
            if len(sentence) > 500:
                # Try to break it further at punctuation marks
                sub_sentences = re.split(r'[;:](?=\s)', sentence)
                processed_sentences.extend([s.strip() for s in sub_sentences if s.strip()])
            else:
                processed_sentences.append(sentence)

        return processed_sentences

    def tokenize_for_transformer(self, text: str, max_length: int = 512) -> Dict:
        """
        Tokenize text for transformer-based models.
        Args:
            text: Input text to tokenize
            max_length: Maximum token length for the model
        Returns:
            Tokenized input dict ready for transformer models
        """
        return self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )

    def extract_entities(self, text: str) -> List[Dict]:
        """
        Extract legal entities from text using spaCy.
        Args:
            text: Legal document text
        Returns:
            List of extracted entities with type information
        """
        doc = nlp(text)
        entities = []
        for ent in doc.ents:
            entities.append({
                "text": ent.text,
                "start": ent.start_char,
                "end": ent.end_char,
                "type": ent.label_
            })

        # Additional legal entity extraction for common patterns
        # Extract case citations
        case_citations = re.finditer(r'[A-Za-z\s]+ v\. [A-Za-z\s]+,?\s+\d+\s+[A-Za-z\.]+\s+\d+', text)
        for match in case_citations:
            entities.append({
                "text": match.group(0),
                "start": match.start(),
                "end": match.end(),
                "type": "CASE_CITATION"
            })

        # Extract statutory references
        statutes = re.finditer(r'\d+\s+U\.S\.C\.\s+§\s+\d+', text)
        for match in statutes:
            entities.append({
                "text": match.group(0),
                "start": match.start(),
                "end": match.end(),
                "type": "STATUTE"
            })

        return entities

    def chunk_document(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
        """
        Split document into overlapping chunks for processing.
        Args:
            text: Document text
            chunk_size: Approximate size of each chunk in characters
            overlap: Number of characters to overlap between chunks
        Returns:
            List of document chunks
        """
        # First split by sentences
        sentences = self.extract_sentences(text)
        chunks = []
        current_chunk = []
        current_length = 0
        for sentence in sentences:
            sentence_length = len(sentence)

            # If adding this sentence would exceed chunk size
            if current_length + sentence_length > chunk_size and current_chunk:
                # Add the current chunk to our list of chunks
                chunks.append(' '.join(current_chunk))

                # Start a new chunk with overlap
                # Find sentences to keep for overlap
                overlap_chars = 0
                overlap_sentences = []
                for s in reversed(current_chunk):
                    if overlap_chars + len(s) <= overlap:
                        overlap_sentences.insert(0, s)
                        overlap_chars += len(s) + 1  # +1 for the space
                    else:
                        break

                current_chunk = overlap_sentences
                current_length = overlap_chars

            current_chunk.append(sentence)
            current_length += sentence_length + 1  # +1 for the space

        # Don't forget the last chunk
        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    def process_document(self, file_path: str) -> Dict:
        """
        Complete processing pipeline for a legal document.
        Args:
            file_path: Path to the legal document
        Returns:
            Dictionary containing processed document information
        """
        # Extract text from file
        raw_text = self.extract_text_from_file(file_path)

        # Preprocess the text
        preprocessed_text = self.preprocess_text(raw_text)

        # Identify document structure
        structure = self.identify_document_structure(preprocessed_text)

        # Extract sentences
        sentences = self.extract_sentences(preprocessed_text)

        # Chunk document for processing
        chunks = self.chunk_document(preprocessed_text)

        # Extract entities
        entities = self.extract_entities(preprocessed_text)

        return {
            "raw_text": raw_text,
            "preprocessed_text": preprocessed_text,
            "structure": structure,
            "sentences": sentences,
            "chunks": chunks,
            "entities": entities
        }

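# Example usage of LegalDocumentProcessor (illustrative sketch only;
# "contract.pdf" is a placeholder path, not a file shipped with this repo):
#
#   processor = LegalDocumentProcessor()
#   doc_info = processor.process_document("contract.pdf")
#   print(list(doc_info["structure"].keys()))   # detected section headers
#   print(len(doc_info["chunks"]))              # number of overlapping chunks
#   print(doc_info["entities"][:5])             # first few extracted entities
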
class LegalSummarizer:
    """
    A comprehensive summarization engine for legal documents that implements
    both extractive and abstractive summarization techniques.
    """

    def __init__(
        self,
        extractive_model: str = "sentence-transformers/all-MiniLM-L6-v2",
        abstractive_model: str = "facebook/bart-large-cnn",
        use_gpu: bool = torch.cuda.is_available()
    ):
        """
        Initialize the legal summarization engine.
        Args:
            extractive_model: Model name for sentence embeddings (extractive)
            abstractive_model: Model name for seq2seq summarization (abstractive)
            use_gpu: Whether to use GPU for inference
        """
        self.device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")

        # Load models
        print(f"Loading extractive model: {extractive_model}")
        self.sentence_model = SentenceTransformer(extractive_model)
        self.sentence_model.to(self.device)

        print(f"Loading abstractive model: {abstractive_model}")
        self.abstractive_tokenizer = AutoTokenizer.from_pretrained(abstractive_model)
        self.abstractive_model = AutoModelForSeq2SeqLM.from_pretrained(abstractive_model)
        self.abstractive_model.to(self.device)

        # Initialize TF-IDF vectorizer for keyword extraction
        self.tfidf_vectorizer = TfidfVectorizer(
            max_features=5000,
            stop_words='english',
            ngram_range=(1, 2)
        )

    def extractive_summarize(
        self,
        sentences: List[str],
        ratio: float = 0.3,
        method: str = "textrank"
    ) -> List[str]:
        """
        Generate an extractive summary of the document.
        Args:
            sentences: List of sentences from the document
            ratio: Percentage of sentences to keep (0.0-1.0)
            method: Summarization method ('textrank', 'lexrank', or 'tfidf')
        Returns:
            List of extracted sentences forming the summary
        """
        if len(sentences) == 0:
            return []

        # Ensure we have a valid ratio
        ratio = max(0.1, min(0.9, ratio))
        num_sentences = max(1, int(len(sentences) * ratio))

        if method == "textrank":
            return self._textrank_summarize(sentences, num_sentences)
        elif method == "lexrank":
            return self._lexrank_summarize(sentences, num_sentences)
        elif method == "tfidf":
            return self._tfidf_summarize(sentences, num_sentences)
        else:
            raise ValueError(f"Unknown summarization method: {method}")

    def _textrank_summarize(self, sentences: List[str], num_sentences: int) -> List[str]:
        """
        TextRank-based extractive summarization.
        Args:
            sentences: List of document sentences
            num_sentences: Number of sentences to extract
        Returns:
            List of extracted sentences
        """
        # Compute sentence embeddings
        embeddings = self.sentence_model.encode(sentences, convert_to_tensor=True)
        embeddings = embeddings.cpu().numpy()

        # Compute similarity matrix
        sim_matrix = cosine_similarity(embeddings)

        # Create graph and run PageRank
        nx_graph = nx.from_numpy_array(sim_matrix)
        scores = nx.pagerank(nx_graph)

        # Sort sentences by score
        ranked_sentences = sorted(((scores[i], s, i) for i, s in enumerate(sentences)), reverse=True)

        # Select top sentences and preserve original order
        top_sentence_indices = sorted([item[2] for item in ranked_sentences[:num_sentences]])
        return [sentences[i] for i in top_sentence_indices]

    def _lexrank_summarize(self, sentences: List[str], num_sentences: int) -> List[str]:
        """
        LexRank-based extractive summarization.
        Args:
            sentences: List of document sentences
            num_sentences: Number of sentences to extract
        Returns:
            List of extracted sentences
        """
        # Compute sentence embeddings
        embeddings = self.sentence_model.encode(sentences, convert_to_tensor=True)
        embeddings = embeddings.cpu().numpy()

        # Compute similarity matrix
        sim_matrix = cosine_similarity(embeddings)

        # Apply threshold to create a binary similarity matrix
        threshold = 0.3  # Can be tuned
        sim_matrix_binary = (sim_matrix > threshold).astype(int)

        # Normalize the matrix by row sums
        row_sums = sim_matrix_binary.sum(axis=1, keepdims=True)
        row_sums[row_sums == 0] = 1  # Avoid division by zero
        transition_matrix = sim_matrix_binary / row_sums

        # Apply power iteration to find the principal eigenvector
        scores = np.ones(len(sentences)) / len(sentences)
        epsilon = 1e-4
        max_iter = 100
        for _ in range(max_iter):
            prev_scores = scores.copy()
            scores = np.dot(transition_matrix.T, scores)
            scores = scores / np.sum(scores)
            if np.sum(np.abs(scores - prev_scores)) < epsilon:
                break

        # Rank sentences
        ranked_indices = np.argsort(-scores)

        # Select top sentences and preserve original order
        top_sentence_indices = sorted(ranked_indices[:num_sentences])
        return [sentences[i] for i in top_sentence_indices]

    def _tfidf_summarize(self, sentences: List[str], num_sentences: int) -> List[str]:
        """
        TF-IDF based extractive summarization.
        Args:
            sentences: List of document sentences
            num_sentences: Number of sentences to extract
        Returns:
            List of extracted sentences
        """
        # Handle the case where we have only one sentence
        if len(sentences) <= 1:
            return sentences

        # Compute TF-IDF matrix
        tfidf_matrix = self.tfidf_vectorizer.fit_transform(sentences)

        # Compute document centroid (as a plain ndarray; scikit-learn rejects np.matrix input)
        centroid = np.asarray(tfidf_matrix.mean(axis=0))

        # Compute similarity of each sentence to centroid
        similarities = []
        for i in range(tfidf_matrix.shape[0]):
            similarity = cosine_similarity(tfidf_matrix[i], centroid)[0][0]
            similarities.append((i, similarity))

        # Rank sentences
        ranked_sentences = sorted(similarities, key=lambda x: x[1], reverse=True)

        # Select top sentences and preserve original order
        top_sentence_indices = sorted([idx for idx, _ in ranked_sentences[:num_sentences]])
        return [sentences[i] for i in top_sentence_indices]

    def abstractive_summarize(
        self,
        text: str,
        max_length: int = 512,
        min_length: int = 150,
        num_beams: int = 4,
        legal_context: bool = True
    ) -> str:
        """
        Generate an abstractive summary of the document.
        Args:
            text: Text to summarize
            max_length: Maximum length of the summary
            min_length: Minimum length of the summary
            num_beams: Number of beams to use for beam search
            legal_context: Add legal domain context to input
        Returns:
            Abstractive summary as a string
        """
        # Add legal context if requested (prepended before tokenization so it is actually used)
        prefix = "Summarize this legal document: " if legal_context else ""

        # Truncate long text to model's maximum input length
        input_max_length = self.abstractive_tokenizer.model_max_length - 100  # Leave room for summary

        # Tokenize and truncate
        input_ids = self.abstractive_tokenizer.encode(
            prefix + text,
            truncation=True,
            max_length=input_max_length,
            return_tensors="pt"
        ).to(self.device)

        # Generate summary
        summary_ids = self.abstractive_model.generate(
            input_ids,
            max_length=max_length,
            min_length=min_length,
            num_beams=num_beams,
            length_penalty=2.0,
            early_stopping=True,
            no_repeat_ngram_size=3
        )

        summary = self.abstractive_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summary

    def section_based_summarization(
        self,
        document_structure: Dict[str, str],
        method: str = "hybrid",
        ratio: float = 0.3
    ) -> Dict[str, str]:
        """
        Summarize each section of a document separately.
        Args:
            document_structure: Dictionary with section names as keys and section text as values
            method: Summarization method ('extractive', 'abstractive', or 'hybrid')
            ratio: Percentage of sentences to keep for extractive summarization
        Returns:
            Dictionary with section names as keys and summaries as values
        """
        section_summaries = {}
        for section_name, section_text in document_structure.items():
            # Skip empty sections or very short sections
            if not section_text or len(section_text) < 100:
                section_summaries[section_name] = section_text
                continue

            if method == "extractive":
                sentences = section_text.split('. ')
                sentences = [s + '.' for s in sentences if s]
                summary = ' '.join(self.extractive_summarize(sentences, ratio))
            elif method == "abstractive":
                # For short sections, use the original text
                if len(section_text) < 500:
                    summary = section_text
                else:
                    summary = self.abstractive_summarize(
                        section_text,
                        max_length=min(512, max(150, len(section_text) // 3)),
                        min_length=min(100, max(50, len(section_text) // 5))
                    )
            elif method == "hybrid":
                # For longer sections, first extract important sentences, then generate abstractive summary
                if len(section_text) < 500:
                    summary = section_text
                else:
                    sentences = section_text.split('. ')
                    sentences = [s + '.' for s in sentences if s]
                    extracted_text = ' '.join(self.extractive_summarize(sentences, ratio=0.5))

                    # If the extracted text is still long, generate abstractive summary
                    if len(extracted_text) > 1000:
                        summary = self.abstractive_summarize(
                            extracted_text,
                            max_length=min(512, len(extracted_text) // 2),
                            min_length=min(150, len(extracted_text) // 4)
                        )
                    else:
                        summary = extracted_text
            else:
                raise ValueError(f"Unknown summarization method: {method}")

            section_summaries[section_name] = summary

        return section_summaries

    def keyword_extraction(self, text: str, num_keywords: int = 10) -> List[str]:
        """
        Extract key legal terms and concepts from text.
        Args:
            text: Document text
            num_keywords: Number of keywords to extract
        Returns:
            List of extracted keywords
        """
        # Fit and transform the text
        tfidf_matrix = self.tfidf_vectorizer.fit_transform([text])

        # Get feature names
        feature_names = self.tfidf_vectorizer.get_feature_names_out()

        # Get sorted indices of top-n features
        indices = np.argsort(tfidf_matrix.toarray()[0])[-num_keywords:]

        # Get top-n keywords
        top_keywords = [feature_names[i] for i in indices]
        return top_keywords[::-1]  # Reverse to get highest score first

    def highlight_key_sentences(
        self,
        text: str,
        sentences: List[str],
        num_highlights: int = 5
    ) -> Dict[str, float]:
        """
        Identify and score key sentences for highlighting.
        Args:
            text: Full document text
            sentences: List of sentences
            num_highlights: Number of sentences to highlight
        Returns:
            Dictionary mapping sentences to their importance scores
        """
        # Handle case with very few sentences
        if len(sentences) <= num_highlights:
            return {s: 1.0 for s in sentences}

        # Extract keywords
        keywords = self.keyword_extraction(text, num_keywords=20)

        # Initialize importance scores
        scores = {}

        # Score sentences based on position, length and keyword presence
        for i, sentence in enumerate(sentences):
            # Position score (earlier and later sentences tend to be more important)
            position_score = 1.0
            if i < len(sentences) * 0.2:  # First 20%
                position_score = 1.5
            elif i > len(sentences) * 0.8:  # Last 20%
                position_score = 1.2

            # Length score (avoid very short sentences)
            length_score = min(1.0, len(sentence) / 100)

            # Keyword score
            keyword_score = 0
            for keyword in keywords:
                if keyword.lower() in sentence.lower():
                    keyword_score += 1
            keyword_score = min(1.0, keyword_score / 5)  # Normalize

            # Combine scores
            scores[sentence] = (position_score + length_score + keyword_score) / 3

        # Sort by score and get top sentences
        sorted_sentences = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        return dict(sorted_sentences[:num_highlights])

    def generate_document_summary(
        self,
        text: str,
        document_structure: Optional[Dict[str, str]] = None,
        method: str = "hybrid",
        ratio: float = 0.3,
        include_keywords: bool = True
    ) -> Dict:
        """
        Generate a comprehensive document summary.
        Args:
            text: Full document text
            document_structure: Optional dictionary with section structure
            method: Summarization method
            ratio: Extractive summarization ratio
            include_keywords: Whether to include keywords in the summary
        Returns:
            Dictionary containing summary information
        """
        result = {}

        # Generate overall summary
        if len(text) > 10000:  # For very long documents, use hybrid approach
            sentences = text.split('. ')
            sentences = [s + '.' for s in sentences if s]
            extracted_text = ' '.join(self.extractive_summarize(sentences, ratio=0.3))
            result["overall_summary"] = self.abstractive_summarize(extracted_text, max_length=512)
        else:
            result["overall_summary"] = self.abstractive_summarize(text)

        # Generate section summaries if structure is provided
        if document_structure:
            result["section_summaries"] = self.section_based_summarization(
                document_structure,
                method=method,
                ratio=ratio
            )

        # Extract keywords
        if include_keywords:
            result["keywords"] = self.keyword_extraction(text, num_keywords=15)

        # Highlight key sentences
        sentences = text.split('. ')
        sentences = [s + '.' for s in sentences if s and len(s) > 20]  # Skip very short fragments
        result["key_sentences"] = self.highlight_key_sentences(text, sentences)

        return result

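# Example usage of LegalSummarizer (illustrative sketch only; reuses the
# doc_info dictionary from the LegalDocumentProcessor example above):
#
#   summarizer = LegalSummarizer()
#   summary = summarizer.generate_document_summary(
#       doc_info["preprocessed_text"],
#       document_structure=doc_info["structure"],
#       method="hybrid",
#       ratio=0.3,
#   )
#   print(summary["overall_summary"])
#   print(summary["keywords"])
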
class LegalLongDocumentSummarizer:
    """
    A summarizer designed specifically for long legal documents,
    using a divide-and-conquer approach with potential for fine-tuning.
    """

    def __init__(
        self,
        model_name: str = "facebook/bart-large-cnn",
        max_chunk_length: int = 1024,
        use_gpu: bool = torch.cuda.is_available()
    ):
        """
        Initialize the long document summarizer.
        Args:
            model_name: Model name for the summarizer
            max_chunk_length: Size of each chunk (characters for splitting; also used as the token truncation limit)
            use_gpu: Whether to use GPU for inference
        """
        self.device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        self.model.to(self.device)
        self.max_chunk_length = max_chunk_length

    def summarize_long_document(self, text: str, max_length: int = 512, min_length: int = 150) -> str:
        """
        Summarize a long legal document by dividing it into chunks.
        Args:
            text: Long document text
            max_length: Maximum length of each chunk summary
            min_length: Minimum length of each chunk summary
        Returns:
            Combined summary of all chunks
        """
        # Split the document into chunks
        chunks = [text[i:i + self.max_chunk_length] for i in range(0, len(text), self.max_chunk_length)]

        # Summarize each chunk
        summaries = []
        for chunk in chunks:
            inputs = self.tokenizer(chunk, return_tensors="pt", truncation=True, max_length=self.max_chunk_length).to(self.device)
            summary_ids = self.model.generate(
                inputs['input_ids'],
                max_length=max_length,
                min_length=min_length,
                length_penalty=2.0,
                num_beams=4,
                early_stopping=True
            )
            summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
            summaries.append(summary)

        # Combine summaries
        combined_summary = ' '.join(summaries)
        return combined_summary
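
# Example end-to-end run for very long filings (illustrative sketch only;
# "long_filing.pdf" is a placeholder path):
#
#   if __name__ == "__main__":
#       processor = LegalDocumentProcessor()
#       long_doc = processor.process_document("long_filing.pdf")
#       long_summarizer = LegalLongDocumentSummarizer()
#       print(long_summarizer.summarize_long_document(long_doc["preprocessed_text"]))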