Soundaryasos committed (verified)
Commit af8f925 · 1 Parent(s): fddb56a

Update app.py

Files changed (1): app.py (+94 -783)
app.py CHANGED
@@ -1,793 +1,104 @@
- import re
- import spacy
- import pandas as pd
- from typing import List, Dict, Tuple, Optional
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
  import fitz # PyMuPDF
  import docx
- from bs4 import BeautifulSoup
  import nltk
- from nltk.tokenize import sent_tokenize
- import numpy as np
- import torch
  import networkx as nx
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.metrics.pairwise import cosine_similarity
- from sentence_transformers import SentenceTransformer
-
- # Download necessary NLTK data
- nltk.download('punkt')
-
- # Load legal-specific NLP model
  nlp = spacy.load("en_core_web_lg")
-
- class LegalDocumentProcessor:
-     """
-     A comprehensive pipeline for processing legal documents.
-     Handles document loading, text extraction, preprocessing, and tokenization.
-     """
-
-     def __init__(self, tokenizer_name: str = "nlpaueb/legal-bert-base-uncased"):
-         """
-         Initialize the legal document processor.
-         Args:
-             tokenizer_name: The HuggingFace tokenizer to use for transformer models
-         """
-         self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
-
-         # Legal-specific patterns
-         self.legal_abbreviations = {
-             "et al.": "and others",
-             "i.e.": "that is",
-             "e.g.": "for example",
-             "v.": "versus",
-             "cf.": "compare",
-             "viz.": "namely",
-             "ex rel.": "on behalf of",
-             "etc.": "etcetera"
-         }
-
-         # Regular expressions for legal citations and references
-         self.citation_pattern = re.compile(r'\d+\s+[A-Za-z\.]+\s+\d+')
-         self.section_pattern = re.compile(r'Section\s+\d+\.\d+', re.IGNORECASE)
-
-         # Legal boilerplate text patterns
-         self.boilerplate_patterns = [
-             r"IN WITNESS WHEREOF.*",
-             r"WHEREAS,.*",
-             r"NOW, THEREFORE,.*",
-             r"The parties hereby agree as follows:.*"
-         ]
-         self.boilerplate_regex = re.compile('|'.join(self.boilerplate_patterns), re.DOTALL)
-
-     def extract_text_from_file(self, file_path: str) -> str:
-         """
-         Extract text from various file formats (PDF, DOCX, TXT, HTML).
-         Args:
-             file_path: Path to the legal document file
-         Returns:
-             Extracted text as a string
-         """
-         file_extension = file_path.split('.')[-1].lower()
-         if file_extension == 'pdf':
-             return self._extract_from_pdf(file_path)
-         elif file_extension in ['docx', 'doc']:
-             return self._extract_from_docx(file_path)
-         elif file_extension == 'txt':
-             with open(file_path, 'r', encoding='utf-8') as f:
-                 return f.read()
-         elif file_extension in ['html', 'htm']:
-             return self._extract_from_html(file_path)
-         else:
-             raise ValueError(f"Unsupported file format: {file_extension}")
-
-     def _extract_from_pdf(self, file_path: str) -> str:
-         """Extract text from PDF files"""
-         doc = fitz.open(file_path)
-         text = ""
-         for page in doc:
-             text += page.get_text()
-         return text
-
-     def _extract_from_docx(self, file_path: str) -> str:
-         """Extract text from DOCX files"""
-         doc = docx.Document(file_path)
-         return '\n'.join([para.text for para in doc.paragraphs])
-
-     def _extract_from_html(self, file_path: str) -> str:
-         """Extract text from HTML files"""
-         with open(file_path, 'r', encoding='utf-8') as f:
-             soup = BeautifulSoup(f.read(), 'html.parser')
-         return soup.get_text()
-
-     def preprocess_text(self, text: str) -> str:
-         """
-         Preprocess legal text by:
-         - Expanding abbreviations
-         - Removing redundant whitespace
-         - Handling special characters
-         - Maintaining sentence structure
-         Args:
-             text: Raw text extracted from a legal document
-         Returns:
-             Preprocessed text
-         """
-         # Replace legal abbreviations
-         for abbr, expansion in self.legal_abbreviations.items():
-             text = re.sub(r'\b' + re.escape(abbr) + r'\b', expansion, text)
-
-         # Remove redundant whitespace
-         text = re.sub(r'\s+', ' ', text)
-
-         # Separate citation references to prevent them from merging with sentences
-         text = re.sub(self.citation_pattern, r' \g<0> ', text)
-
-         # Handle section references
-         text = re.sub(self.section_pattern, r' \g<0> ', text)
-
-         # Normalize newlines to separate sections properly
-         text = re.sub(r'\n+', '\n', text)
-
-         return text.strip()
-
-     def identify_document_structure(self, text: str) -> Dict[str, List[str]]:
-         """
-         Identify key structural elements in the legal document.
-         Args:
-             text: Preprocessed legal document text
-         Returns:
-             Dictionary containing identified sections
-         """
-         # Split into sections based on headers
-         sections = {}
-
-         # Identify potential headers (uppercase text followed by newline)
-         potential_headers = re.finditer(r'([A-Z][A-Z\s]+[A-Z])[:\.\n]', text)
-
-         # Extract sections based on identified headers
-         last_pos = 0
-         last_header = "PREAMBLE"
-         for match in potential_headers:
-             header = match.group(1).strip()
-             start_pos = match.start()
-
-             # Add the previous section
-             if last_pos < start_pos:
-                 sections[last_header] = text[last_pos:start_pos].strip()
-
-             last_pos = match.end()
-             last_header = header
-
-         # Add the final section
-         if last_pos < len(text):
-             sections[last_header] = text[last_pos:].strip()
-
-         return sections
-
-     def extract_sentences(self, text: str) -> List[str]:
-         """
-         Split text into sentences, handling legal-specific patterns.
-         Args:
-             text: Preprocessed legal document text
-         Returns:
-             List of sentences
-         """
-         # Use NLTK's sentence tokenizer as a base
-         sentences = sent_tokenize(text)
-
-         # Post-process to handle potential issues with legal text
-         processed_sentences = []
-         for sentence in sentences:
-             # Skip empty sentences
-             if not sentence.strip():
-                 continue
-
-             # Clean up sentences
-             sentence = sentence.strip()
-
-             # Check if sentence is too long (might be incorrectly split)
-             if len(sentence) > 500:
-                 # Try to break it further at punctuation marks
-                 sub_sentences = re.split(r'[;:](?=\s)', sentence)
-                 processed_sentences.extend([s.strip() for s in sub_sentences if s.strip()])
-             else:
-                 processed_sentences.append(sentence)
-
-         return processed_sentences
-
-     def tokenize_for_transformer(self, text: str, max_length: int = 512) -> Dict:
-         """
-         Tokenize text for transformer-based models.
-         Args:
-             text: Input text to tokenize
-             max_length: Maximum token length for the model
-         Returns:
-             Tokenized input dict ready for transformer models
-         """
-         return self.tokenizer(
-             text,
-             padding="max_length",
-             truncation=True,
-             max_length=max_length,
-             return_tensors="pt"
-         )
-
-     def extract_entities(self, text: str) -> List[Dict]:
-         """
-         Extract legal entities from text using spaCy.
-         Args:
-             text: Legal document text
-         Returns:
-             List of extracted entities with type information
-         """
-         doc = nlp(text)
-         entities = []
-         for ent in doc.ents:
-             entities.append({
-                 "text": ent.text,
-                 "start": ent.start_char,
-                 "end": ent.end_char,
-                 "type": ent.label_
-             })
-
-         # Additional legal entity extraction for common patterns
-         # Extract case citations
-         case_citations = re.finditer(r'[A-Za-z\s]+ v\. [A-Za-z\s]+,?\s+\d+\s+[A-Za-z\.]+\s+\d+', text)
-         for match in case_citations:
-             entities.append({
-                 "text": match.group(0),
-                 "start": match.start(),
-                 "end": match.end(),
-                 "type": "CASE_CITATION"
-             })
-
-         # Extract statutory references
-         statutes = re.finditer(r'\d+\s+U\.S\.C\.\s+§\s+\d+', text)
-         for match in statutes:
-             entities.append({
-                 "text": match.group(0),
-                 "start": match.start(),
-                 "end": match.end(),
-                 "type": "STATUTE"
-             })
-
-         return entities
-
-     def chunk_document(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
-         """
-         Split document into overlapping chunks for processing.
-         Args:
-             text: Document text
-             chunk_size: Approximate size of each chunk in characters
-             overlap: Number of characters to overlap between chunks
-         Returns:
-             List of document chunks
-         """
-         # First split by sentences
-         sentences = self.extract_sentences(text)
-         chunks = []
-         current_chunk = []
-         current_length = 0
-         for sentence in sentences:
-             sentence_length = len(sentence)
-
-             # If adding this sentence would exceed chunk size
-             if current_length + sentence_length > chunk_size and current_chunk:
-                 # Add the current chunk to our list of chunks
-                 chunks.append(' '.join(current_chunk))
-
-                 # Start a new chunk with overlap
-                 # Find sentences to keep for overlap
-                 overlap_chars = 0
-                 overlap_sentences = []
-                 for s in reversed(current_chunk):
-                     if overlap_chars + len(s) <= overlap:
-                         overlap_sentences.insert(0, s)
-                         overlap_chars += len(s) + 1 # +1 for the space
-                     else:
-                         break
-
-                 current_chunk = overlap_sentences
-                 current_length = overlap_chars
-
-             current_chunk.append(sentence)
-             current_length += sentence_length + 1 # +1 for the space
-
-         # Don't forget the last chunk
-         if current_chunk:
-             chunks.append(' '.join(current_chunk))
-
-         return chunks
-
-     def process_document(self, file_path: str) -> Dict:
-         """
-         Complete processing pipeline for a legal document.
-         Args:
-             file_path: Path to the legal document
-         Returns:
-             Dictionary containing processed document information
-         """
-         # Extract text from file
-         raw_text = self.extract_text_from_file(file_path)
-
-         # Preprocess the text
-         preprocessed_text = self.preprocess_text(raw_text)
-
-         # Identify document structure
-         structure = self.identify_document_structure(preprocessed_text)
-
-         # Extract sentences
-         sentences = self.extract_sentences(preprocessed_text)
-
-         # Chunk document for processing
-         chunks = self.chunk_document(preprocessed_text)
-
-         # Extract entities
-         entities = self.extract_entities(preprocessed_text)
-
-         return {
-             "raw_text": raw_text,
-             "preprocessed_text": preprocessed_text,
-             "structure": structure,
-             "sentences": sentences,
-             "chunks": chunks,
-             "entities": entities
-         }
-
- class LegalSummarizer:
-     """
-     A comprehensive summarization engine for legal documents that implements
-     both extractive and abstractive summarization techniques.
-     """
-
-     def __init__(
-         self,
-         extractive_model: str = "sentence-transformers/all-MiniLM-L6-v2",
-         abstractive_model: str = "facebook/bart-large-cnn",
-         use_gpu: bool = torch.cuda.is_available()
-     ):
-         """
-         Initialize the legal summarization engine.
-         Args:
-             extractive_model: Model name for sentence embeddings (extractive)
-             abstractive_model: Model name for seq2seq summarization (abstractive)
-             use_gpu: Whether to use GPU for inference
-         """
-         self.device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
-
-         # Load models
-         print(f"Loading extractive model: {extractive_model}")
-         self.sentence_model = SentenceTransformer(extractive_model)
-         self.sentence_model.to(self.device)
-
-         print(f"Loading abstractive model: {abstractive_model}")
-         self.abstractive_tokenizer = AutoTokenizer.from_pretrained(abstractive_model)
-         self.abstractive_model = AutoModelForSeq2SeqLM.from_pretrained(abstractive_model)
-         self.abstractive_model.to(self.device)
-
-         # Initialize TF-IDF vectorizer for keyword extraction
-         self.tfidf_vectorizer = TfidfVectorizer(
-             max_features=5000,
-             stop_words='english',
-             ngram_range=(1, 2)
-         )
-
-     def extractive_summarize(
-         self,
-         sentences: List[str],
-         ratio: float = 0.3,
-         method: str = "textrank"
-     ) -> List[str]:
-         """
-         Generate an extractive summary of the document.
-         Args:
-             sentences: List of sentences from the document
-             ratio: Percentage of sentences to keep (0.0-1.0)
-             method: Summarization method ('textrank', 'lexrank', or 'tfidf')
-         Returns:
-             List of extracted sentences forming the summary
-         """
-         if len(sentences) == 0:
-             return []
-
-         # Ensure we have a valid ratio
-         ratio = max(0.1, min(0.9, ratio))
-         num_sentences = max(1, int(len(sentences) * ratio))
-
-         if method == "textrank":
-             return self._textrank_summarize(sentences, num_sentences)
-         elif method == "lexrank":
-             return self._lexrank_summarize(sentences, num_sentences)
-         elif method == "tfidf":
-             return self._tfidf_summarize(sentences, num_sentences)
-         else:
-             raise ValueError(f"Unknown summarization method: {method}")
-
-     def _textrank_summarize(self, sentences: List[str], num_sentences: int) -> List[str]:
-         """
-         TextRank-based extractive summarization.
-         Args:
-             sentences: List of document sentences
-             num_sentences: Number of sentences to extract
-         Returns:
-             List of extracted sentences
-         """
-         # Compute sentence embeddings
-         embeddings = self.sentence_model.encode(sentences, convert_to_tensor=True)
-         embeddings = embeddings.cpu().numpy()
-
-         # Compute similarity matrix
-         sim_matrix = cosine_similarity(embeddings)
-
-         # Create graph and run PageRank
-         nx_graph = nx.from_numpy_array(sim_matrix)
-         scores = nx.pagerank(nx_graph)
-
-         # Sort sentences by score
-         ranked_sentences = sorted(((scores[i], s, i) for i, s in enumerate(sentences)), reverse=True)
-
-         # Select top sentences and preserve original order
-         top_sentence_indices = sorted([item[2] for item in ranked_sentences[:num_sentences]])
-         return [sentences[i] for i in top_sentence_indices]
-
-     def _lexrank_summarize(self, sentences: List[str], num_sentences: int) -> List[str]:
-         """
-         LexRank-based extractive summarization.
-         Args:
-             sentences: List of document sentences
-             num_sentences: Number of sentences to extract
-         Returns:
-             List of extracted sentences
-         """
-         # Compute sentence embeddings
-         embeddings = self.sentence_model.encode(sentences, convert_to_tensor=True)
-         embeddings = embeddings.cpu().numpy()
-
-         # Compute similarity matrix
-         sim_matrix = cosine_similarity(embeddings)
-
-         # Apply threshold to create a binary similarity matrix
-         threshold = 0.3 # Can be tuned
-         sim_matrix_binary = (sim_matrix > threshold).astype(int)
-
-         # Normalize the matrix by row sums
-         row_sums = sim_matrix_binary.sum(axis=1, keepdims=True)
-         row_sums[row_sums == 0] = 1 # Avoid division by zero
-         transition_matrix = sim_matrix_binary / row_sums
-
-         # Apply power iteration to find the principal eigenvector
-         scores = np.ones(len(sentences)) / len(sentences)
-         epsilon = 1e-4
-         max_iter = 100
-         for _ in range(max_iter):
-             prev_scores = scores.copy()
-             scores = np.dot(transition_matrix.T, scores)
-             scores = scores / np.sum(scores)
-             if np.sum(np.abs(scores - prev_scores)) < epsilon:
-                 break
-
-         # Rank sentences
-         ranked_indices = np.argsort(-scores)
-
-         # Select top sentences and preserve original order
-         top_sentence_indices = sorted(ranked_indices[:num_sentences])
-         return [sentences[i] for i in top_sentence_indices]
-
-     def _tfidf_summarize(self, sentences: List[str], num_sentences: int) -> List[str]:
-         """
-         TF-IDF based extractive summarization.
-         Args:
-             sentences: List of document sentences
-             num_sentences: Number of sentences to extract
-         Returns:
-             List of extracted sentences
-         """
-         # Handle the case where we have only one sentence
-         if len(sentences) <= 1:
-             return sentences
-
-         # Compute TF-IDF matrix
-         tfidf_matrix = self.tfidf_vectorizer.fit_transform(sentences)
-
-         # Compute document centroid
-         centroid = tfidf_matrix.mean(axis=0)
-
-         # Compute similarity of each sentence to centroid
-         similarities = []
-         for i in range(tfidf_matrix.shape[0]):
-             similarity = cosine_similarity(tfidf_matrix[i], centroid)[0][0]
-             similarities.append((i, similarity))
-
-         # Rank sentences
-         ranked_sentences = sorted(similarities, key=lambda x: x[1], reverse=True)
-
-         # Select top sentences and preserve original order
-         top_sentence_indices = sorted([idx for idx, _ in ranked_sentences[:num_sentences]])
-         return [sentences[i] for i in top_sentence_indices]
-
-     def abstractive_summarize(
-         self,
-         text: str,
-         max_length: int = 512,
-         min_length: int = 150,
-         num_beams: int = 4,
-         legal_context: bool = True
-     ) -> str:
-         """
-         Generate an abstractive summary of the document.
-         Args:
-             text: Text to summarize
-             max_length: Maximum length of the summary
-             min_length: Minimum length of the summary
-             num_beams: Number of beams to use for beam search
-             legal_context: Add legal domain context to input
-         Returns:
-             Abstractive summary as a string
-         """
-         # Truncate long text to model's maximum input length
-         input_max_length = self.abstractive_tokenizer.model_max_length - 100 # Leave room for summary
-
-         # Tokenize and truncate
-         input_ids = self.abstractive_tokenizer.encode(
-             text,
-             truncation=True,
-             max_length=input_max_length,
-             return_tensors="pt"
-         ).to(self.device)
-
-         # Add legal context if requested
-         prefix = "Summarize this legal document: " if legal_context else ""
-
-         # Generate summary
-         summary_ids = self.abstractive_model.generate(
-             input_ids,
-             max_length=max_length,
-             min_length=min_length,
-             num_beams=num_beams,
-             length_penalty=2.0,
-             early_stopping=True,
-             no_repeat_ngram_size=3
-         )
-
-         summary = self.abstractive_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-         return summary
-
-     def section_based_summarization(
-         self,
-         document_structure: Dict[str, str],
-         method: str = "hybrid",
-         ratio: float = 0.3
-     ) -> Dict[str, str]:
-         """
-         Summarize each section of a document separately.
-         Args:
-             document_structure: Dictionary with section names as keys and section text as values
-             method: Summarization method ('extractive', 'abstractive', or 'hybrid')
-             ratio: Percentage of sentences to keep for extractive summarization
-         Returns:
-             Dictionary with section names as keys and summaries as values
-         """
-         section_summaries = {}
-         for section_name, section_text in document_structure.items():
-             # Skip empty sections or very short sections
-             if not section_text or len(section_text) < 100:
-                 section_summaries[section_name] = section_text
-                 continue
-
-             if method == "extractive":
-                 sentences = section_text.split('. ')
-                 sentences = [s + '.' for s in sentences if s]
-                 summary = ' '.join(self.extractive_summarize(sentences, ratio))
-             elif method == "abstractive":
-                 # For short sections, use the original text
-                 if len(section_text) < 500:
-                     summary = section_text
-                 else:
-                     summary = self.abstractive_summarize(
-                         section_text,
-                         max_length=min(512, max(150, len(section_text) // 3)),
-                         min_length=min(100, max(50, len(section_text) // 5))
-                     )
-             elif method == "hybrid":
-                 # For longer sections, first extract important sentences, then generate abstractive summary
-                 if len(section_text) < 500:
-                     summary = section_text
-                 else:
-                     sentences = section_text.split('. ')
-                     sentences = [s + '.' for s in sentences if s]
-                     extracted_text = ' '.join(self.extractive_summarize(sentences, ratio=0.5))
-
-                     # If the extracted text is still long, generate abstractive summary
-                     if len(extracted_text) > 1000:
-                         summary = self.abstractive_summarize(
-                             extracted_text,
-                             max_length=min(512, len(extracted_text) // 2),
-                             min_length=min(150, len(extracted_text) // 4)
-                         )
-                     else:
-                         summary = extracted_text
-             else:
-                 raise ValueError(f"Unknown summarization method: {method}")
-
-             section_summaries[section_name] = summary
-
-         return section_summaries
-
-     def keyword_extraction(self, text: str, num_keywords: int = 10) -> List[str]:
-         """
-         Extract key legal terms and concepts from text.
-         Args:
-             text: Document text
-             num_keywords: Number of keywords to extract
-         Returns:
-             List of extracted keywords
-         """
-         # Fit and transform the text
-         tfidf_matrix = self.tfidf_vectorizer.fit_transform([text])
-
-         # Get feature names
-         feature_names = self.tfidf_vectorizer.get_feature_names_out()
-
-         # Get sorted indices of top-n features
-         indices = np.argsort(tfidf_matrix.toarray()[0])[-num_keywords:]
-
-         # Get top-n keywords
-         top_keywords = [feature_names[i] for i in indices]
-         return top_keywords[::-1] # Reverse to get highest score first
-
-     def highlight_key_sentences(
-         self,
-         text: str,
-         sentences: List[str],
-         num_highlights: int = 5
-     ) -> Dict[str, float]:
-         """
-         Identify and score key sentences for highlighting.
-         Args:
-             text: Full document text
-             sentences: List of sentences
-             num_highlights: Number of sentences to highlight
-         Returns:
-             Dictionary mapping sentences to their importance scores
-         """
-         # Handle case with very few sentences
-         if len(sentences) <= num_highlights:
-             return {s: 1.0 for s in sentences}
-
-         # Extract keywords
-         keywords = self.keyword_extraction(text, num_keywords=20)
-
-         # Initialize importance scores
-         scores = {}
-
-         # Score sentences based on position, length and keyword presence
-         for i, sentence in enumerate(sentences):
-             # Position score (earlier and later sentences tend to be more important)
-             position_score = 1.0
-             if i < len(sentences) * 0.2: # First 20%
-                 position_score = 1.5
-             elif i > len(sentences) * 0.8: # Last 20%
-                 position_score = 1.2
-
-             # Length score (avoid very short sentences)
-             length_score = min(1.0, len(sentence) / 100)
-
-             # Keyword score
-             keyword_score = 0
-             for keyword in keywords:
-                 if keyword.lower() in sentence.lower():
-                     keyword_score += 1
-             keyword_score = min(1.0, keyword_score / 5) # Normalize
-
-             # Combine scores
-             scores[sentence] = (position_score + length_score + keyword_score) / 3
-
-         # Sort by score and get top sentences
-         sorted_sentences = sorted(scores.items(), key=lambda x: x[1], reverse=True)
-         return dict(sorted_sentences[:num_highlights])
-
-     def generate_document_summary(
-         self,
-         text: str,
-         document_structure: Optional[Dict[str, str]] = None,
-         method: str = "hybrid",
-         ratio: float = 0.3,
-         include_keywords: bool = True
-     ) -> Dict:
-         """
-         Generate a comprehensive document summary.
-         Args:
-             text: Full document text
-             document_structure: Optional dictionary with section structure
-             method: Summarization method
-             ratio: Extractive summarization ratio
-             include_keywords: Whether to include keywords in the summary
-         Returns:
-             Dictionary containing summary information
-         """
-         result = {}
-
-         # Generate overall summary
-         if len(text) > 10000: # For very long documents, use hybrid approach
-             sentences = text.split('. ')
-             sentences = [s + '.' for s in sentences if s]
-             extracted_text = ' '.join(self.extractive_summarize(sentences, ratio=0.3))
-             result["overall_summary"] = self.abstractive_summarize(extracted_text, max_length=512)
-         else:
-             result["overall_summary"] = self.abstractive_summarize(text)
-
-         # Generate section summaries if structure is provided
-         if document_structure:
-             result["section_summaries"] = self.section_based_summarization(
-                 document_structure,
-                 method=method,
-                 ratio=ratio
-             )
-
-         # Extract keywords
-         if include_keywords:
-             result["keywords"] = self.keyword_extraction(text, num_keywords=15)
-
-         # Highlight key sentences
-         sentences = text.split('. ')
-         sentences = [s + '.' for s in sentences if s and len(s) > 20] # Skip very short fragments
-         result["key_sentences"] = self.highlight_key_sentences(text, sentences)
-
-         return result
-
- class LegalLongDocumentSummarizer:
-     """
-     A summarizer designed specifically for long legal documents,
-     using a divide-and-conquer approach with potential for fine-tuning.
-     """
-
-     def __init__(
-         self,
-         model_name: str = "facebook/bart-large-cnn",
-         max_chunk_length: int = 1024,
-         use_gpu: bool = torch.cuda.is_available()
-     ):
-         """
-         Initialize the long document summarizer.
-         Args:
-             model_name: Model name for the summarizer
-             max_chunk_length: Maximum token length for each chunk
-             use_gpu: Whether to use GPU for inference
-         """
-         self.device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
-         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
-         self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-         self.model.to(self.device)
-         self.max_chunk_length = max_chunk_length
-
-     def summarize_long_document(self, text: str, max_length: int = 512, min_length: int = 150) -> str:
-         """
-         Summarize a long legal document by dividing it into chunks.
-         Args:
-             text: Long document text
-             max_length: Maximum length of the summary
-             min_length: Minimum length of the summary
-         Returns:
-             Combined summary of all chunks
-         """
-         # Split the document into chunks
-         chunks = [text[i:i+self.max_chunk_length] for i in range(0, len(text), self.max_chunk_length)]
-
-         # Summarize each chunk
-         summaries = []
-         for chunk in chunks:
-             inputs = self.tokenizer(chunk, return_tensors="pt", truncation=True, max_length=self.max_chunk_length).to(self.device)
-             summary_ids = self.model.generate(
-                 inputs['input_ids'],
-                 max_length=max_length,
-                 min_length=min_length,
-                 length_penalty=2.0,
-                 num_beams=4,
-                 early_stopping=True
-             )
-             summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-             summaries.append(summary)
-
-         # Combine summaries
-         combined_summary = ' '.join(summaries)
-         return combined_summary
+ import streamlit as st
  import fitz # PyMuPDF
  import docx
  import nltk
+ import spacy
  import networkx as nx
+ import matplotlib.pyplot as plt
+ from transformers import pipeline
+ from collections import Counter
+
+ # Load NLP Models
+ nltk.download("punkt")
  nlp = spacy.load("en_core_web_lg")
+ summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+ qa_pipeline = pipeline("question-answering")
+
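
Note that `pipeline("question-answering")` is created without an explicit model, so transformers falls back to a library default (currently distilbert-base-cased-distilled-squad) and logs a warning; the app's answers can change whenever that default changes. A minimal sketch of pinning the checkpoint, an assumption for reproducibility rather than part of this commit:

    # Hypothetical: pin the QA checkpoint instead of relying on the pipeline default.
    qa_pipeline = pipeline(
        "question-answering",
        model="distilbert-base-cased-distilled-squad",
    )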
+ # Function to extract text from PDF
+ def extract_text_from_pdf(pdf_file):
+     doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
+     text = "\n".join([page.get_text("text") for page in doc])
+     return text
+
+ # Function to extract text from DOCX
+ def extract_text_from_docx(docx_file):
+     doc = docx.Document(docx_file)
+     text = "\n".join([para.text for para in doc.paragraphs])
+     return text
+
+ # Summarization function
+ def summarize_text(text):
+     # truncation=True keeps long documents within the model's input limit instead of erroring
+     return summarizer(text, max_length=200, min_length=50, do_sample=False, truncation=True)[0]["summary_text"]
+
+ # Q&A Function
+ def answer_question(text, question):
+     return qa_pipeline({"context": text, "question": question})["answer"]
+
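
facebook/bart-large-cnn reads only about 1024 tokens, so even with truncation a long contract is summarized from its opening pages alone. A chunked variant in the spirit of the deleted LegalLongDocumentSummarizer could look like this (a sketch; the name summarize_long_text and the 3000-character chunk size are illustrative assumptions):

    def summarize_long_text(text, chunk_chars=3000):
        # Summarize the document piece by piece, then join the partial summaries.
        chunks = [text[i:i + chunk_chars] for i in range(0, len(text), chunk_chars)]
        parts = [summarizer(c, max_length=150, min_length=30, do_sample=False,
                            truncation=True)[0]["summary_text"] for c in chunks]
        return " ".join(parts)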
+ # Named Entity Recognition (NER)
+ def extract_entities(text):
+     doc = nlp(text)
+     entities = [(ent.text, ent.label_) for ent in doc.ents]
+     return entities
+
+ # Generate Mind Map
+ def generate_mind_map(text):
+     doc = nlp(text)
+     entity_counts = Counter([ent.text for ent in doc.ents])
+
+     G = nx.Graph()
+     for entity, count in entity_counts.items():
+         G.add_node(entity, size=count * 100)
+
+     pos = nx.spring_layout(G)
+     plt.figure(figsize=(10, 7))
+     nx.draw(G, pos, with_labels=True, node_size=[G.nodes[n]['size'] for n in G.nodes], node_color="skyblue")
+     plt.title("Mind Map of Entities")
+     st.pyplot(plt)
+
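
As committed, generate_mind_map adds nodes but never any edges, so the "mind map" renders as disconnected bubbles. One way to relate entities is sentence-level co-occurrence; a sketch of a loop that could follow the add_node calls (an assumed heuristic, not part of this commit):

    # Hypothetical: inside generate_mind_map, after the nodes are added,
    # connect entities that appear in the same sentence.
    for sent in doc.sents:  # en_core_web_lg segments sentences via its parser
        ents = sorted({ent.text for ent in sent.ents})
        for i in range(len(ents)):
            for j in range(i + 1, len(ents)):
                G.add_edge(ents[i], ents[j])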
+ # Streamlit UI
+ st.set_page_config(page_title="Legal Document Summarizer & Query System", layout="wide")
+ st.title("📜 Legal Document Summarization, NER & Mind Map System")
+ st.markdown("""Upload a legal document, get a summary, extract entities, and generate a mind map!""")
+
+ # File uploader
+ uploaded_file = st.file_uploader("Upload a PDF or DOCX", type=["pdf", "docx"])
+
+ if uploaded_file:
+     if uploaded_file.type == "application/pdf":
+         document_text = extract_text_from_pdf(uploaded_file)
+     elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+         document_text = extract_text_from_docx(uploaded_file)
+     else:
+         st.error("Unsupported file format!")
+         st.stop()
+
+     st.subheader("Extracted Text Preview")
+     st.text_area("Document Content", document_text[:2000], height=250)
+
+     # Summarization
+     if st.button("Summarize Document"):
+         summary = summarize_text(document_text)
+         st.subheader("📌 Summary")
+         st.success(summary)
+
+     # Question Answering
+     user_question = st.text_input("Ask a question about the document:")
+     if user_question:
+         answer = answer_question(document_text, user_question)
+         st.subheader("📝 Answer")
+         st.info(answer)
+
+     # Named Entity Recognition
+     if st.button("Extract Entities"):
+         entities = extract_entities(document_text)
+         st.subheader("📌 Named Entities")
+         for entity, label in entities:
+             st.write(f"**{entity}** - {label}")
+
+     # Mind Map Generation
+     if st.button("Generate Mind Map"):
+         st.subheader("🧠 Mind Map of Entities")
+         generate_mind_map(document_text)
+
+ st.markdown("---")
+ st.caption("🚀 Built with Hugging Face, spaCy, and Streamlit")
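
To try the new app locally, something like the following should work (the package list is inferred from the imports above; nothing here is pinned by this commit):

    pip install streamlit pymupdf python-docx nltk spacy networkx matplotlib transformers torch
    python -m spacy download en_core_web_lg
    streamlit run app.py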