OpenDeepResearch

Runtime error

App Files Files Community

Leonardo commited on Mar 27, 2025

Commit

9a5117b

verified ·

1 Parent(s): 66af00d

Create scripts/legal_document_tool.py

Browse files

Files changed (1) hide show

scripts/legal_document_tool.py +505 -0

scripts/legal_document_tool.py ADDED Viewed

	@@ -0,0 +1,505 @@

+"""
+Legal Document Processing Tool for Smolagents
+This tool processes legal documents with specialized models for legal text,
+optimizing for citation retention, multilingual support, and performance on
+legal-specific retrieval tasks.
+Author: Dr. Zhou Wang
+"""
+from typing import Dict, List, Any, Optional, Union
+import os
+import re
+import time
+import tempfile
+import numpy as np
+from tqdm import tqdm
+# Import Smolagents Tool class
+from smolagents import Tool
+# Import NLP components
+try:
+    from sklearn.metrics.pairwise import cosine_similarity
+    from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Document
+    from llama_index.core.node_parser import MarkdownNodeParser
+    from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+    from llama_index.core.ingestion import IngestionPipeline
+    from langchain.text_splitter import RecursiveCharacterTextSplitter
+except ImportError:
+    raise ImportError(
+        "Required dependencies not found. Please install with: "
+        "pip install llama-index langchain scikit-learn tqdm"
+    )
+# Model configurations based on research findings
+LEGAL_MODELS = {
+    "legal-bert": {
+        "name": "nlp-jurisprudence/legal-bert-base-uncased",
+        "description": "Trained on ECtHR legal documents, specialized in human rights law",
+        "max_length": 512,
+        "requires_gpu": True,
+    },
+    "multi-qa-mpnet": {
+        "name": "sentence-transformers/multi-qa-mpnet-base-dot-v1",
+        "description": "Optimized for legal Q&A retrieval with cross-lingual support",
+        "max_length": 512,
+        "requires_gpu": False,
+    },
+    "legal-xlm-roberta": {
+        "name": "joelito/legal-xlm-roberta-base",
+        "description": "Multilingual legal model with EU legislation and RFC/ISO pattern awareness",
+        "max_length": 512,
+        "requires_gpu": True,
+    },
+    "multilingual-e5": {
+        "name": "intfloat/multilingual-e5-base",
+        "description": "Dense retrieval optimized with citation context preservation",
+        "max_length": 512,
+        "requires_gpu": True,
+    },
+    "all-mpnet": {
+        "name": "sentence-transformers/all-mpnet-base-v2",
+        "description": "General purpose embedding model, good baseline for legal text",
+        "max_length": 512,
+        "requires_gpu": False,
+    },
+}
+class LegalDocumentProcessor:
+    """
+    Processor for legal documents with specialized models,
+    citation preservation, and benchmarking capabilities.
+    """
+    def __init__(
+        self,
+        model_key: str = "legal-xlm-roberta",
+        use_gpu: bool = False,
+        chunk_size: int = 512,
+        chunk_overlap: int = 100,
+    ):
+        """
+        Initialize the legal document processor.
+        Args:
+            model_key: Key for the model to use from LEGAL_MODELS dictionary
+            use_gpu: Whether to use GPU for embeddings (if available)
+            chunk_size: Size of text chunks for processing
+            chunk_overlap: Overlap between chunks to preserve context
+        """
+        # Validate and set up model
+        if model_key not in LEGAL_MODELS:
+            print(
+                f"Warning: Model '{model_key}' not found. Using legal-xlm-roberta as default."
+            )
+            model_key = "legal-xlm-roberta"
+        model_config = LEGAL_MODELS[model_key]
+        device = "cuda" if use_gpu and model_config["requires_gpu"] else "cpu"
+        # Initialize embedding model
+        self.embed_model = HuggingFaceEmbedding(
+            model_name=model_config["name"],
+            device=device,
+            tokenizer_kwargs={
+                "trust_remote_code": True,
+                "max_length": model_config["max_length"],
+                "truncation": True,
+            },
+        )
+        # Store model information for reference
+        self.model_info = model_config
+        self.model_key = model_key
+        # Legal document-optimized text splitter with improved chunk size
+        self.splitter = RecursiveCharacterTextSplitter(
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+            separators=[
+                "\n## ",
+                "\n### ",
+                "\n#### ",  # Headers
+                "\n\n",
+                "\n",  # Paragraphs
+                ". ",
+                "! ",
+                "? ",  # Sentences
+                ";",
+                ":",  # Clause boundaries
+                " ",  # Last resort
+            ],
+        )
+        # Pattern for removing footers from legal documents
+        # Separated into individual patterns for better maintainability
+        self.footer_patterns = [
+            r"^Page\s\d+(\s+of\s+\d+)?$",  # Page numbers
+            r"^©.*\b(Company|Inc|Ltd)\b.*$",  # Copyright lines
+            r"^All rights reserved.*?$",  # Legal boilerplate
+            r"^-+$",  # Separator lines
+            r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}(:\d{2})?$",  # Timestamps
+            r"(?i)^(confidential|proprietary|internal use only)",  # Security tags
+        ]
+        # Join all patterns with the OR operator
+        combined_pattern = "|".join(f"({pattern})" for pattern in self.footer_patterns)
+        # Compile the combined pattern
+        self.footer_pattern = re.compile(
+            combined_pattern, flags=re.MULTILINE | re.IGNORECASE
+        )
+    def remove_footers(self, text: str) -> str:
+        """
+        Remove common document footer patterns from text.
+        Args:
+            text: The input text to process
+        Returns:
+            Text with footer patterns removed
+        """
+        return self.footer_pattern.sub("", text)
+    def clean_text(self, text: str) -> str:
+        """
+        Preserve legal citations while cleaning artifacts.
+        Args:
+            text: The input text to clean
+        Returns:
+            Cleaned text with citations preserved
+        """
+        # First remove footers
+        text = self.remove_footers(text)
+        # Preserve citation patterns
+        # Pattern 1: Footnote numbers (e.g., 98, 99, 100)
+        cleaned = re.sub(r"(?<=\D)(\d{2,3})(?=\D)", r"[\1]", text)
+        # Pattern 2: Case citations [2019] UKSC 20
+        # Already well-structured, so no changes needed
+        # Pattern 3: Standardize quotation marks
+        cleaned = cleaned.replace("''", '"').replace("``", '"')
+        # Pattern 4: Handle section references (§3.1, §123)
+        cleaned = re.sub(r"§(\d+(\.\d+)?)", r"Section \1", cleaned)
+        # Pattern 5: Handle legal abbreviations (e.g., Art. -> Article)
+        cleaned = re.sub(r"\bArt\.\s+(\d+)", r"Article \1", cleaned)
+        # Pattern 6: Standardize case names with v. and vs.
+        cleaned = re.sub(r"\bv\s+", r"v. ", cleaned)
+        cleaned = re.sub(r"\bvs\s+", r"v. ", cleaned)
+        # Pattern 7: RFC/ISO pattern standardization (RFC 1234, ISO 9001)
+        cleaned = re.sub(r"\b(RFC|ISO)\s*[:#]?\s*(\d+)", r"\1 \2", cleaned)
+        return cleaned
+    def create_pipeline(self) -> IngestionPipeline:
+        """
+        Create a document processing pipeline.
+        Returns:
+            Configured IngestionPipeline object
+        """
+        return IngestionPipeline(
+            transformations=[
+                self.clean_text,
+                MarkdownNodeParser(),
+                self.splitter,
+                self.embed_model,
+            ]
+        )
+    def validate_citation_retention(
+        self, documents: List[Document]
+    ) -> Dict[str, float]:
+        """
+        Measure semantic similarity of citations before/after text cleaning.
+        Args:
+            documents: List of Document objects to validate
+        Returns:
+            Dictionary with validation metrics
+        """
+        if not documents:
+            return {"citation_retention": 0.0, "processing_time": 0.0}
+        start_time = time.time()
+        # Extract original texts
+        original_texts = [doc.text for doc in documents[:5]]  # Sample for performance
+        # Apply cleaning
+        processed_texts = [self.clean_text(text) for text in original_texts]
+        # Calculate embeddings
+        try:
+            # Direct access to the underlying HuggingFace model
+            orig_embeds = self.embed_model._model.encode(original_texts)
+            proc_embeds = self.embed_model._model.encode(processed_texts)
+            # Calculate similarity
+            similarities = cosine_similarity(orig_embeds, proc_embeds).diagonal()
+            avg_similarity = float(np.mean(similarities))
+            processing_time = time.time() - start_time
+            return {
+                "citation_retention": avg_similarity * 100,  # As percentage
+                "processing_time": processing_time,
+                "sample_size": len(original_texts),
+            }
+        except Exception as e:
+            return {"citation_retention": 0.0, "processing_time": 0.0, "error": str(e)}
+    def process_documents(self, documents: List[Document]) -> Dict[str, Any]:
+        """
+        Process a list of legal documents.
+        Args:
+            documents: List of Document objects to process
+        Returns:
+            Dictionary with processing results and stats
+        """
+        if not documents:
+            return {"status": "error", "message": "No documents provided"}
+        try:
+            # Create pipeline and process documents
+            pipeline = self.create_pipeline()
+            nodes = pipeline.run(documents=documents)
+            # Create vector index
+            index = VectorStoreIndex(nodes)
+            query_engine = index.as_query_engine()
+            # Return success with stats
+            return {
+                "status": "success",
+                "nodes_count": len(nodes),
+                "documents_count": len(documents),
+                "model_used": self.model_key,
+                "query_engine": query_engine,  # This will be used for querying
+            }
+        except Exception as e:
+            return {"status": "error", "message": str(e)}
+class LegalDocumentTool(Tool):
+    """
+    Tool for processing legal documents with specialized models and querying capabilities.
+    """
+    name = "legal_document_processor"
+    description = (
+        "Processes legal documents with specialized models for legal text, optimizing for "
+        "citation retention, multilingual support, and performance on legal-specific retrieval tasks. "
+        "Can process text or file inputs and provide enhanced query capabilities."
+    )
+    inputs = {
+        "text": {
+            "type": "string",
+            "description": "Legal document text to process. Provide either text or file_paths.",
+            "optional": True,
+        },
+        "file_paths": {
+            "type": "string",
+            "description": "Comma-separated list of file paths or a directory path containing legal documents. Provide either text or file_paths.",
+            "optional": True,
+        },
+        "model_key": {
+            "type": "string",
+            "description": "Legal embedding model to use. Options: legal-bert, multi-qa-mpnet, legal-xlm-roberta, multilingual-e5, all-mpnet",
+            "default": "legal-xlm-roberta",
+        },
+        "query": {
+            "type": "string",
+            "description": "Optional query to run against the processed documents.",
+            "optional": True,
+        },
+        "validate_citations": {
+            "type": "boolean",
+            "description": "Whether to validate citation retention in the processed documents.",
+            "default": False,
+        },
+        "use_gpu": {
+            "type": "boolean",
+            "description": "Whether to use GPU for embedding calculations if available.",
+            "default": False,
+        },
+    }
+    output_type = "string"
+    def _load_documents(self, input_path: str) -> List[Document]:
+        """
+        Load documents from a file path or directory.
+        Args:
+            input_path: Path to a file or directory
+        Returns:
+            List of Document objects
+        """
+        if os.path.isfile(input_path):
+            # Create a SimpleDirectoryReader for the file's directory
+            # and filter to only include this file
+            directory = os.path.dirname(input_path)
+            filename = os.path.basename(input_path)
+            return SimpleDirectoryReader(
+                input_dir=directory,
+                required_exts=[
+                    os.path.splitext(filename)[1][1:]
+                ],  # Extension without dot
+                filename_as_id=True,
+            ).load_data()
+        elif os.path.isdir(input_path):
+            return SimpleDirectoryReader(
+                input_dir=input_path,
+                filename_as_id=True,
+            ).load_data()
+        else:
+            raise ValueError(f"Path not found: {input_path}")
+    def _create_document_from_text(self, text: str) -> List[Document]:
+        """
+        Create a Document object from text.
+        Args:
+            text: Text content
+        Returns:
+            List containing a single Document object
+        """
+        # Create a temporary file to store the text
+        with tempfile.NamedTemporaryFile(
+            mode="w", suffix=".md", delete=False
+        ) as temp_file:
+            temp_file.write(text)
+            temp_path = temp_file.name
+        try:
+            # Load the document from the temporary file
+            documents = self._load_documents(temp_path)
+            return documents
+        finally:
+            # Clean up the temporary file
+            os.remove(temp_path)
+    def forward(
+        self,
+        text: Optional[str] = None,
+        file_paths: Optional[str] = None,
+        model_key: str = "legal-xlm-roberta",
+        query: Optional[str] = None,
+        validate_citations: bool = False,
+        use_gpu: bool = False,
+    ) -> str:
+        """
+        Process legal documents and optionally run a query.
+        Args:
+            text: Legal document text to process
+            file_paths: Comma-separated list of file paths or a directory path
+            model_key: Legal embedding model to use
+            query: Optional query to run against the processed documents
+            validate_citations: Whether to validate citation retention
+            use_gpu: Whether to use GPU for embeddings
+        Returns:
+            Processing results or query response as a string
+        """
+        # Validate inputs
+        if not text and not file_paths:
+            return "Error: Either text or file_paths must be provided."
+        try:
+            # Initialize processor
+            processor = LegalDocumentProcessor(
+                model_key=model_key,
+                use_gpu=use_gpu,
+            )
+            # Load documents
+            documents = []
+            if text:
+                documents.extend(self._create_document_from_text(text))
+            if file_paths:
+                # Handle comma-separated paths
+                paths = [path.strip() for path in file_paths.split(",")]
+                for path in paths:
+                    try:
+                        docs = self._load_documents(path)
+                        documents.extend(docs)
+                    except Exception as e:
+                        return f"Error loading documents from {path}: {str(e)}"
+            # Check if we have documents to process
+            if not documents:
+                return "Error: No valid documents found."
+            # Validate citations if requested
+            validation_results = {}
+            if validate_citations:
+                validation_results = processor.validate_citation_retention(documents)
+            # Process documents
+            result = processor.process_documents(documents)
+            if result["status"] != "success":
+                return f"Processing error: {result['message']}"
+            # Run query if provided
+            if query and "query_engine" in result:
+                query_engine = result["query_engine"]
+                response = query_engine.query(query)
+                # Format the response
+                output = f"Query: {query}\n\nResponse: {response}\n\n"
+                output += f"Documents processed: {result['documents_count']}\n"
+                output += f"Text chunks: {result['nodes_count']}\n"
+                output += f"Model used: {result['model_used']}\n"
+                # Add validation results if available
+                if validation_results:
+                    output += "\n=== Citation Retention Validation ===\n"
+                    output += f"Citation retention: {validation_results.get('citation_retention', 0):.2f}%\n"
+                    output += f"Processing time: {validation_results.get('processing_time', 0):.2f} seconds\n"
+                return output
+            # If no query, return processing stats
+            output = "Document processing complete.\n\n"
+            output += f"Documents processed: {result['documents_count']}\n"
+            output += f"Text chunks: {result['nodes_count']}\n"
+            output += f"Model used: {result['model_used']}\n"
+            # Add validation results if available
+            if validation_results:
+                output += "\n=== Citation Retention Validation ===\n"
+                output += f"Citation retention: {validation_results.get('citation_retention', 0):.2f}%\n"
+                output += f"Processing time: {validation_results.get('processing_time', 0):.2f} seconds\n"
+            output += "\nThe documents are now ready for querying. Use the 'query' parameter to run a query."
+            return output
+        except Exception as e:
+            return f"Error: {str(e)}"